Alogomachine's BLOG


  • Home

  • Archives

SVM补充和肿瘤预测案例

Posted on 2021-07-30 | In 机器学习 , 算法

机器学习算法支持向量机SVM

SVM

公式与优势

image-20210730224149091

image-20210730224119817

优势:

  • 理论完美
  • 支持不同的Kernel,用于调参

缺点:

  • 当数据量大的时候,训练比较慢

||W|| : 范数

核函数

解决非线性的方法-核函数

  • 在非线性图像内运用
  • 映射方式不同

解决多分类

解决多分类:

  • 一对多法( 其他形成一类)
  • 一对一法

肿瘤预测

1
2
3
import pandas as pd

data = pd.read_csv('data.csv')
1
2
3
pd.set_option('display.max_columns',None)
data.columns
data.describe()

diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 0.372583 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 0.405172 1.216853 2.866059 40.337079 0.007041 0.025478 0.031894 0.011796 0.020542 0.003795 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 0.483918 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 0.277313 0.551648 2.021855 45.491006 0.003003 0.017908 0.030186 0.006170 0.008266 0.002646 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 0.000000 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 0.111500 0.360200 0.757000 6.802000 0.001713 0.002252 0.000000 0.000000 0.007882 0.000895 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 0.000000 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 0.232400 0.833900 1.606000 17.850000 0.005169 0.013080 0.015090 0.007638 0.015160 0.002248 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 0.000000 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 0.324200 1.108000 2.287000 24.530000 0.006380 0.020450 0.025890 0.010930 0.018730 0.003187 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 1.000000 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 0.478900 1.474000 3.357000 45.190000 0.008146 0.032450 0.042050 0.014710 0.023480 0.004558 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 1.000000 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 2.873000 4.885000 21.980000 542.200000 0.031130 0.135400 0.396000 0.052790 0.078950 0.029840 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500
**
**
1
2
3
features_mean = list(data.columns[2:12]) # 平均值数据
features_se = list(data.columns[12:22]) # 标准差数据
features_worst = list(data.columns[22:32]) # 最大值数据
1
data.drop('id',axis=1,inplace=True)
1
2
3
# 更改内容
data.diagnosis = data.diagnosis.map({'M':1,'B':0}) # map映射
data.head()

diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 1 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 1 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 1 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 1 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
**
**
1
2
3
import seaborn as sns
import matplotlib.pyplot as plt
plt =sns.countplot(data.diagnosis,label='Count')#条形图

png

1
2
3
4
# 研究相关性
corr = data[features_mean].corr()

sns.heatmap(corr,annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x27be75f2278>

png

1
features_remain = ['radius_mean','texture_mean', 'smoothness_mean','compactness_mean','symmetry_mean', 'fractal_dimension_mean']
1
2
3
4
5
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

1
2
3
4
5
train,test = train_test_split(data,test_size=0.3)
train_X = train[features_remain]
train_y = train['diagnosis']
test_X = test[features_remain]
test_y = test['diagnosis']
1
2
3
4
# Standardize features: fit the scaler on the training data only, then
# apply the SAME transformation to the test data.  Calling fit_transform
# on the test set (as the original did) re-fits the scaler on test
# statistics -- a form of data leakage that skews the evaluation.
ss = StandardScaler()
train_X = ss.fit_transform(train_X)
test_X = ss.transform(test_X)

1
2
3
4
5
6
model = svm.SVC()
# 训练数据
model.fit(train_X,train_y)
# 6.模型评估 # 分类阶段
prediction = model.predict(test_X)
print('准确率:',metrics.accuracy_score(prediction,test_y))
准确率: 0.935672514619883
顺便评个分吧!👇

SVM算法分析与人脸识别案例

Posted on 2021-07-30 | In 机器学习 , 算法

机器学习算法支持向量机SVM

SVM

image-20210730223217944

SVM:

  • 线性分类器
  • 在深度神经网络之前特别火热

SV:

  • 支持向量(Support Vector)
    • 处于分类边界的点

image-20210730223235968

怎么求SV:

  • 代入等于-1/1
  • 其他的值大于1就在支持向量之上(标签为1)
  • 其他的值小于-1就在支持向量之下(标签为-1)

带松弛变量的SVM数学模型

  • C越大越满足原始定义,根据边界绝对分割(斜线)
  • C越小,越满足松弛的定义,(类似直线)

image-20210730223516650

image-20210730223546723

image-20210730223602574

SVM异常值处理

image-20210730223405670

image-20210730223414208

image-20210730223425957

逻辑回归和SVM区别

逻辑回归

  • 线性分类

  • 考虑所有的点

SVM:

  • 线性分类
  • 考虑类与类之间边界的点
  • 对歧义的值不会太敏感

代码理解SVM

1
2
3
4
5
6
7
8
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns
sns.set()

创建模拟数据集

1
2
3
4
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=50, centers=2,
random_state=0, cluster_std=0.60)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn');

png

可以有多种方法分类

1
2
3
4
5
6
7
xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')

for m, b in [(1, 0.65), (0.5, 1.6), (-0.2, 2.9)]:
plt.plot(xfit, m * xfit + b, '-k')

plt.xlim(-1, 3.5);

png

SVM: 假想每一条分割线是有宽度的

1
2
3
4
5
6
7
8
9
10
xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')

for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
yfit = m * xfit + b
plt.plot(xfit, yfit, '-k')
plt.fill_between(xfit, yfit - d, yfit + d, edgecolor='none',
color='#AAAAAA', alpha=0.4)

plt.xlim(-1, 3.5);

png

在SVM的框架下, 认为最宽的线为最优的分割线

训练SVM

使用线性SVM和比较大的 C

1
2
3
from sklearn.svm import SVC # "Support vector classifier"
model = SVC(kernel='linear', C=1E10)
model.fit(X, y)
SVC(C=10000000000.0, kernel='linear')

创建一个显示SVM分割线的函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def plot_svc_decision_function(model, ax=None, plot_support=True):
    """Plot the decision function for a 2D SVC.

    Draws the decision boundary (level 0) and the two margin lines
    (levels -1 and +1) of a fitted classifier, and optionally circles
    the support vectors.

    Parameters
    ----------
    model : fitted sklearn.svm.SVC trained on 2-D inputs
    ax : matplotlib Axes to draw on (defaults to the current axes)
    plot_support : if True, highlight ``model.support_vectors_``
    """
    if ax is None:
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # create grid to evaluate model
    x = np.linspace(xlim[0], xlim[1], 30)
    y = np.linspace(ylim[0], ylim[1], 30)
    Y, X = np.meshgrid(y, x)
    xy = np.vstack([X.ravel(), Y.ravel()]).T
    P = model.decision_function(xy).reshape(X.shape)

    # plot decision boundary and margins
    ax.contour(X, Y, P, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])

    # plot support vectors
    if plot_support:
        ax.scatter(model.support_vectors_[:, 0],
                   model.support_vectors_[:, 1],
                   s=300, linewidth=1, facecolors='none');
    # restore the limits read above (contour may have widened them)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
1
2
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(model);

png

1
model.support_vectors_
array([[0.44359863, 3.11530945],
       [2.33812285, 3.43116792],
       [2.06156753, 1.96918596]])

非支持向量的数据, 对分割线没有影响

只有支持向量会影响分割线, 如果我们添加一些非支持向量的数据, 对分割线没有影响

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def plot_svm(N=10, ax=None):
    """Fit a (near) hard-margin linear SVC on the first N blob points and plot it.

    Used to demonstrate that only the support vectors determine the
    decision boundary: growing N with non-support-vector points leaves
    the boundary unchanged.
    """
    X, y = make_blobs(n_samples=200, centers=2,
                      random_state=0, cluster_std=0.60)
    X = X[:N]
    y = y[:N]
    model = SVC(kernel='linear', C=1E10)  # very large C ~ hard margin
    model.fit(X, y)

    ax = ax or plt.gca()
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    ax.set_xlim(-1, 4)
    ax.set_ylim(-1, 6)
    plot_svc_decision_function(model, ax)

# Same data at two sample sizes: the boundary should look identical
# because the added points are not support vectors.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for axi, N in zip(ax, [60, 120]):
    plot_svm(N, axi)
    axi.set_title('N = {0}'.format(N))

output_16_0

二. 使用SVM进行人脸识别

1
2
3
4
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
print(faces.target_names)
print(faces.images.shape)
['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
 'Gerhard Schroeder' 'Hugo Chavez' 'Junichiro Koizumi' 'Tony Blair']
(1348, 62, 47)
1
2
3
4
5
fig, ax = plt.subplots(3, 5)
for i, axi in enumerate(ax.flat):
axi.imshow(faces.images[i], cmap='bone')
axi.set(xticks=[], yticks=[],
xlabel=faces.target_names[faces.target[i]])

png

每一幅图的尺寸为 [62×47] , 大约 3000 个像素值

我们可以将整个图像展平为一个长度为3000左右的一维向量, 然后使用这个向量作为特征. 通常更有效的方法是通过预处理提取图像最重要的特征. 一个重要的特征提取方法是PCA(主成分分析), 它可以将一幅图像转换为一个长度更短的(如150维)向量.

1
2
3
4
5
6
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='linear', class_weight='balanced')
model = make_pipeline(pca, svc)

将数据分为训练和测试数据集

1
2
3
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target,
random_state=42)

调参:通过交叉验证寻找最佳的 C (控制间隔的大小)

1
2
3
4
5
6
7
from sklearn.model_selection import GridSearchCV

param_grid = {'svc__C': [1, 5, 10, 50]}
grid = GridSearchCV(model, param_grid)

%time grid.fit(Xtrain, ytrain)
print(grid.best_params_)
Wall time: 10 s
{'svc__C': 1}
1
2
model = grid.best_estimator_
yfit = model.predict(Xtest)

使用训练好的SVM做预测

1
2
3
4
5
6
7
fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
axi.imshow(Xtest[i].reshape(62, 47), cmap='bone')
axi.set(xticks=[], yticks=[])
axi.set_ylabel(faces.target_names[yfit[i]].split()[-1],
color='black' if yfit[i] == ytest[i] else 'red')
fig.suptitle('Predicted Names; Incorrect Labels in Red', size=14);

png

生成性能报告

1
2
3
from sklearn.metrics import classification_report
print(classification_report(ytest, yfit,
target_names=faces.target_names))
                   precision    recall  f1-score   support

     Ariel Sharon       0.62      0.67      0.65        15
     Colin Powell       0.74      0.84      0.79        68
  Donald Rumsfeld       0.74      0.81      0.77        31
    George W Bush       0.85      0.79      0.82       126
Gerhard Schroeder       0.75      0.78      0.77        23
      Hugo Chavez       0.93      0.70      0.80        20
Junichiro Koizumi       0.92      0.92      0.92        12
       Tony Blair       0.81      0.83      0.82        42

         accuracy                           0.80       337
        macro avg       0.80      0.79      0.79       337
     weighted avg       0.80      0.80      0.80       337

混淆矩阵

1
2
3
4
5
6
7
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(ytest, yfit)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
xticklabels=faces.target_names,
yticklabels=faces.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label');

png

SVM总结

优点

  • 模型只需要保存支持向量, 模型占用内存少, 预测快.
  • 分类只取决于支持向量, 适合数据的维度高的情况, 例如DNA数据

缺点

  • 训练的时间复杂度为 $\mathcal{O}[N^3]$ 或者至少 $\mathcal{O}[N^2]$, 当数据量巨大时候不合适使用.
  • 需要做调参 $C$ 当数据量大时非常耗时间.
顺便评个分吧!👇

朴素贝叶斯补充与情感分析案例

Posted on 2021-07-28 | In 机器学习 , 算法

机器学习算法朴素贝叶斯

朴素贝叶斯

image-20210728232250767

单词表示:

把一组词典用向量表示

文本分析

单词表示

这个单词在词典出现的位置就标为1,其他为0

image-20210728232438878

句子的表示

将词典与句子比较

每个词在句子里面出现就标记为1

image-20210728232502673

count_vector :

把每个句子中的词语出现的次数作为向量的值

数一下词典里面词语出现的次数

TF-IDF表示

使用Tf-idf -构建向量

并不是出现的越多越重要

并不是出现的越少越不重要

公式:

image-20210728232746607

解释:

tf :这个句子中词语(1,2….)出现的次数

idf:

N:文档的总数

N(W):w出现的次数

加log为了平滑,减小数值上的差距

很多文档都出现就不那么重要

就几个文档出现,说明这个词语很重要

具体计算:

image-20210728232922340

情感分析

1
2
3
4
5
6

from matplotlib import pyplot as plt
import jieba # 分词
import re # 正则
from sklearn.feature_extraction.text import TfidfVectorizer #tf-idf
import numpy as np

读取数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def read_data(path, is_pos=None):
    """Read a review corpus file and return its texts and labels.

    The file holds blocks of the form::

        <review id="..." label="0|1">
        ...one or more text lines...
        </review>

    path: path to the data file
    is_pos: if True/False, force all labels to 1/0 (used for files that
        contain only positive or only negative samples); if None, labels
        are parsed from the ``label`` attribute of each ``<review>`` tag.
    return: (list of review texts, list of labels)
    """
    reviews, labels = [], []
    with open(path, 'r', encoding='utf-8') as file:
        review_start = False
        review_text = []
        for line in file:
            line = line.strip()
            if not line:
                continue
            if not review_start and line.startswith("<review"):
                review_start = True
                if "label" in line:
                    # the label is the last quoted attribute value
                    labels.append(int(line.split('"')[-2]))
                continue
            if review_start and line == "</review>":
                review_start = False
                reviews.append(" ".join(review_text))
                review_text = []
                continue
            if review_start:
                review_text.append(line)
    if is_pos:
        labels = [1] * len(reviews)
    elif is_pos is not None:  # is_pos is False: all negative samples
        labels = [0] * len(reviews)
    return reviews, labels


def process_file():
    """Load the sentiment train/test files and lightly preprocess them.

    The positive/negative training files carry their polarity in the
    filename, so ``read_data`` is told the label explicitly; the
    combined test file keeps labels inside its ``<review>`` tags.

    return: (train_comments, train_labels, test_comments, test_labels)
    """
    train_pos_file = "data_sentiment/train.positive.txt"  # training data
    train_neg_file = "data_sentiment/train.negative.txt"  # training data
    test_comb_file = "data_sentiment/test.combined.txt"   # test data

    # Read each file and concatenate the two training halves.
    train_pos_cmts, train_pos_lbs = read_data(train_pos_file, True)
    train_neg_cmts, train_neg_lbs = read_data(train_neg_file, False)
    train_comments = train_pos_cmts + train_neg_cmts
    train_labels = train_pos_lbs + train_neg_lbs
    test_comments, test_labels = read_data(test_comb_file)
    return train_comments, train_labels, test_comments, test_labels

train_comments, train_labels, test_comments, test_labels = process_file()
1
2
3
4
# 训练数据和测试数据大小
print (len(train_comments), len(test_comments))

print (train_comments[1], train_labels[1])
8064 2500
手感超好,而且黑色相比白色在转得时候不容易眼花,找童年的记忆啦。 1

数据预处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def load_stopwords(path):
    """Load a stop-word set from an external file (one word per line)."""
    with open(path, 'r', encoding='utf-8') as in_file:
        return {line.strip() for line in in_file}


def clean_non_chinese_symbols(text):
    """Normalise non-Chinese symbols: collapse !/? runs, map other symbol
    runs to a single ' UNK ' token, and squeeze whitespace."""
    substitutions = (
        ('[!!]+', "!"),   # collapse half/full-width exclamation marks
        ('[??]+', "?"),   # collapse half/full-width question marks
        # any run of latin letters / punctuation becomes one UNK token
        ("[a-zA-Z#$%&\'()*+,-./:;:<=>@,。★、…【】《》“”‘’[\\]^_`{|}~]+", " UNK "),
        ("\s+", " "),     # squeeze runs of whitespace to single spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_numbers(text):
    """Replace each run of digits (e.g. prices like 128, 190) with ' NUM '."""
    digit_run = re.compile("\d+")
    return digit_run.sub(' NUM ', text)

def preprocess_text(text, stopwords):
    """Preprocess one comment: strip symbols, mask numbers, segment with
    jieba, and drop empty tokens and stop words."""
    cleaned = clean_numbers(clean_non_chinese_symbols(text))
    kept = (term for term in jieba.cut(cleaned) if term and term not in stopwords)
    return " ".join(kept)
1
2
path_stopwords = "./data_sentiment/stopwords.txt"
stopwords = load_stopwords(path_stopwords)
1
2
3
4
5
6
7
8
9
10
11
12
13
# 对于train_comments, test_comments进行字符串的处理,几个考虑的点:
# 1. 停用词过滤
# 2. 去掉特殊符号
# 3. 去掉数字(比如价格..)
# 4. ...
# 需要注意的点是,由于评论数据本身很短,如果去掉的太多,很可能字符串长度变成0
# 预处理部部分,可以自行选择合适的方案,只要注释就可以。

train_comments_new = [preprocess_text(comment, stopwords) for comment in train_comments]
test_comments_new = [preprocess_text(comment, stopwords) for comment in test_comments]

print (train_comments_new[0], test_comments_new[0])

发短信 特别 不 方便 ! 背后 屏幕 很大 起来 不 舒服   UNK   手触 屏 ! 切换 屏幕 很 麻烦 ! 终于 找到 同道中人 初中   UNK   已经 喜欢 上   UNK   同学 都 鄙夷 眼光 看   UNK   人为   UNK   样子 古怪 说 " 丑 " 当场 气晕 现在 同道中人   UNK   好开心 !   UNK   !   UNK  
1
2
3
4
5
6
7
8
9
#   利用tf-idf从文本中提取特征,写到数组里面. 
# 参考:https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_comments_new) # 训练数据的特征
y_train = train_labels # 训练数据的label
X_test = tfidf.transform(test_comments_new) # 测试数据的特征
y_test = test_labels# 测试数据的label

print (np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test))
(8064, 23101) (2500, 23101) (8064,) (2500,)

贝叶斯训练

1
# 8064个样本 词库共有23101
1
2
3
4
5
6
7
8
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

clf = MultinomialNB()
# 利用朴素贝叶斯做训练
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("朴素贝叶斯的准确率: ", accuracy_score(y_test, y_pred))
朴素贝叶斯的准确率:  0.6368

KNN分类器训练

1
2
3
4
5
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("KNN算法准确率: ", accuracy_score(y_test, y_pred))
KNN算法准确率:  0.524

维度过高,不适合用KNN

逻辑回归训练

1
2
3
4
5
6
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("逻辑回归的准确率: ", accuracy_score(y_test, y_pred))
逻辑回归的准确率:  0.7136

逻辑回归和朴素贝叶斯进行分类的区别

逻辑回归(条件):

  • 根据条件区别来判别

朴素贝叶斯 (联合):

  • 根据总体特征来判断
  • 可以生成相关的数据

比如:

区别人与狗

逻辑回归:找到人与狗的区别,发现人两条腿走路,狗4条腿

朴素贝叶斯:记住人和狗的所有特征

当给狗的测试样本

逻辑回归:

  • 四条腿走路-狗(人是两条腿走路)

朴素贝叶斯:

  • 是狗头,鼻子嗅觉很强,毛很长,四条腿….. - 是狗

相关课后数学知识

当特征为实数的时候

朴素贝叶斯使用的其实是高斯朴素贝叶斯模型

为什么叫“朴素”

因为为了简化,使用了条件独立的方法

即进行了以下转换

image-20210728233725032

朴素贝叶斯的最大似然

最大似然估计推导

生成模型和判别模型

具体看上面逻辑回归(判别模型)和朴素贝叶斯(生成模型)进行分类的区别

顺便评个分吧!👇

朴素贝叶斯与垃圾邮件分类案例

Posted on 2021-07-28 | In 机器学习 , 算法

机器学习算法朴素贝叶斯

朴素贝叶斯

朴素贝叶斯

  • 最适合简单的文本分析算法

公式推导

最大似然估计推导

贝叶斯定理:

image-20210728230651797

image-20210728230906004

朴素贝叶斯核心思想

  • 根据某些词频大小的特征来判断属于什么类别
  • 似然/条件独立…..

垃圾邮件分类

步骤

  1. 归纳词库,确定垃圾词语
  2. 计算垃圾词语在正常邮件和垃圾邮件的概率以及先验概率
  3. 对需要分类的邮件进行分词
  4. 根据P(正常|邮件内容) 与 P(垃圾|邮件内容)的比较来进行分类(用到条件独立进行简化)

​ 分别计算出邮件中的单词在正常邮件和垃圾邮件中的概率:

​ P(垃圾|邮件内容) = 一封邮件为垃圾的概率

​ P(正常|邮件内容) = 一封邮件为正常的概率

​ 如果 P(垃圾|邮件内容) > P(正常|邮件内容),则推测此邮件为垃圾邮件;如果 P(垃圾|邮件内容) < P(正常|邮件内容),则推测此邮件为正常邮件。

先验概率

  • 垃圾邮件占总邮件的比例
  • 正常邮件占邮件的比例

处理某个P为0的情况:

  • 使用平滑的方法(分子+1,分母加V)
  • V是词库的大小
  • 也就是把0极小化,而让他不能为0

利用贝叶斯公式进行推导过程图:

image-20210728231519251

具体计算过程:

image-20210728231336309

归纳词库

= .unique()

垃圾邮件分类

代码实现

1
import pandas as pd
1
2
3
data = pd.read_csv('spam.csv',encoding='latin')
data.head()
# ham 正常 spam垃圾邮件

v1 v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
1 ham Ok lar... Joking wif u oni... NaN NaN NaN
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN
3 ham U dun say so early hor... U c already then say... NaN NaN NaN
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN
**
**
1
2
data.rename(columns={"v1":"Label","v2":'Text'},inplace=True) #重命名
data.head()

Label Text Unnamed: 2 Unnamed: 3 Unnamed: 4
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
1 ham Ok lar... Joking wif u oni... NaN NaN NaN
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN
3 ham U dun say so early hor... U c already then say... NaN NaN NaN
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN
**
**
1
2
# 把ham改为0 ,spam改为1
data['num_label'] = data['Label'].map({"ham":0,"spam":1})
1
data.head()

Label Text Unnamed: 2 Unnamed: 3 Unnamed: 4 num_label
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN 0
1 ham Ok lar... Joking wif u oni... NaN NaN NaN 0
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN 1
3 ham U dun say so early hor... U c already then say... NaN NaN NaN 0
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN 0
**
**
1
2
3
print("正常邮件的数目为:{}".format(len(data[data['num_label'] == 0])))
print("垃圾邮件的数目为:{}".format(len(data[data['num_label'] == 1])))
print('总共邮件有{}'.format(len(data[data['num_label'] == 1])+len(data[data['num_label'] == 0])))
正常邮件的数目为:4825
垃圾邮件的数目为:747
总共邮件有5572

7:1 还好

1
data.loc[1,'Text'] # [1.'Text'] 取第2行的数据中的Text列
'Ok lar... Joking wif u oni...'
1
2
3
# 统计文本长度
text_lengths = [len(data.loc[i,'Text']) for i in range(len(data))]
print('最短的邮件内容长度为{}'.format(min(text_lengths)))
最短的邮件内容长度为2
1
import matplotlib.pyplot as plt
1
2
3
plt.hist(text_lengths,50,facecolor='blue',alpha=0.5) # bins =100 分为100个格子
plt.xlim([0,200]) # 重新设置x长度
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
12
13
# 需要把文章转换成向量的形式
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer # 能把某个词语转换成向量的形式

# stopset = set(stopwords.words("english"))
# 解决停用词可选可不选(在文本中没有什么作用的词语:他(he) 她(she) 一个(an/a),这个(this),那个(that).....)


# 构建文本向量
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data.Text)
y = data.num_label
print(X) #稀疏矩阵
  (0, 8267)    1
  (0, 1069)    1
  (0, 3594)    1
  (0, 7645)    1
  (0, 2048)    1
  (0, 1749)    1
  (0, 4476)    1
  (0, 8489)    1
  (0, 3634)    1
  (0, 1751)    1
  (0, 4087)    1
  (0, 5537)    1
  (0, 1303)    1
  (0, 2327)    1
  (0, 5920)    1
  (0, 4350)    1
  (0, 8030)    1
  (0, 3550)    1
  (1, 5533)    1
  (1, 8392)    1
  (1, 4318)    1
  (1, 4512)    1
  (1, 5504)    1
  (2, 77)    1
  (2, 1156)    1
  :    :
  (5570, 1786)    1
  (5570, 3470)    1
  (5570, 2892)    1
  (5570, 7049)    1
  (5570, 1778)    1
  (5570, 8065)    1
  (5570, 2592)    1
  (5570, 5334)    1
  (5570, 1438)    1
  (5570, 7627)    1
  (5570, 3308)    1
  (5570, 7039)    1
  (5570, 4615)    1
  (5570, 1084)    1
  (5570, 8313)    1
  (5570, 4218)    1
  (5570, 3781)    1
  (5570, 7756)    1
  (5570, 3358)    1
  (5570, 4087)    1
  (5571, 6505)    1
  (5571, 7885)    1
  (5571, 4225)    2
  (5571, 5244)    1
  (5571, 7756)    1
1
2
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=2021,test_size=0.2)
1
2
print('训练数据为{}'.format(X_train.shape[0]))
print('测试数据为{}'.format(X_test.shape[0]))
训练数据为4457
测试数据为1115
1
2
3
4
5
6
7
8
# 利用朴素贝叶斯做训练
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

clf = MultinomialNB(alpha = 1.0,fit_prior = True) # alpha =1.0 使用平滑的方法, fit_prior使用先验概率
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("模型拟合的准确率为{:.1f}%".format(accuracy_score(y_test,y_pred)*100))
模型拟合的准确率为98.3%
1
2
3
# 打印混淆矩阵
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred,labels=[0,1]) # 以0 和1的方式展示它,两个不同的标签,使用展示的就是2*2
array([[958,  11],
       [  8, 138]], dtype=int64)

混淆矩阵可视化:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])  # confusion matrix

import matplotlib.pyplot as plt  # plotting library
plt.matshow(cm, cmap=plt.cm.cool)  # draw the matrix; see matplotlib docs for other colormaps
plt.colorbar()  # colour scale legend

# Write each count onto its cell.  matshow places cm[row, col] at data
# coordinates (x=col, y=row), so the annotation position must be (y, x);
# the original used xy=(x, y), which transposed the off-diagonal labels
# relative to the image.
for x in range(len(cm)):      # rows = true label
    for y in range(len(cm)):  # columns = predicted label
        plt.annotate(cm[x, y], xy=(y, x), horizontalalignment='center', verticalalignment='center')

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

output_18_0

具体配色点击

查看模型的训练效果-混淆矩阵

通过混淆矩阵

  • 958+11 = 969 个标签为0的 但958个是正确的 11个错误的

  • 8+138 = 146 个标签为1的 但8个是分类错误的 138 个是分类正确的

  • 测试数据时候总共分类了969个标签为0的 146个标签为1的

  • 但0有958个正确了 11个错误了

  • 但1有 8个错误了 138个正确了

  • 主对角线是正确的

顺便评个分吧!👇

逻辑回归分析与预测客户是否开设存款账户实战

Posted on 2021-07-27 | In 机器学习 , 算法

机器学习算法逻辑回归

逻辑回归

  • 一般用于分类
  • 一般逻辑回归只能解决分类两种(是/否),解决二元分类模型
  • 但其实可以扩展为多元分类

损失函数是判断模型好不好:

  1. 有一个凸函数
  2. 有全局的最优解
  3. 求梯度

由于二元分类的值不是连续的,只有0/1,因此如果使用线性回归的话,对数据会很容易敏感

image-20210727211118260

因此我们需要引入逻辑回归对二元分类问题进行处理:

image-20210727211159865

相关知识点

image-20210727211212225

image-20210727211238821

逻辑回归预测银行客户是否会开设定期存款账户

逻辑回归要求预测值为 0 或者 1, 自变量特征值应该彼此独立。

1
2
3
4
5
6
7
8
9
10
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

该数据集来自UCI机器学习库(http://archive.ics.uci.edu/ml/datasets/Bank+Marketing),葡萄牙银行的电话营销。 分类目标是预测客户是否会开设到定期存款账户(预测值y)。

1
2
data = pd.read_csv('banking.csv') 
data.head()

age job marital education default housing loan contact month day_of_week ... campaign pdays previous poutcome emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed y
0 44 blue-collar married basic.4y unknown yes no cellular aug thu ... 1 999 0 nonexistent 1.4 93.444 -36.1 4.963 5228.1 0
1 53 technician married unknown no no no cellular nov fri ... 1 999 0 nonexistent -0.1 93.200 -42.0 4.021 5195.8 0
2 28 management single university.degree no yes no cellular jun thu ... 3 6 2 success -1.7 94.055 -39.8 0.729 4991.6 1
3 39 services married high.school no no no cellular apr fri ... 2 999 0 nonexistent -1.8 93.075 -47.1 1.405 5099.1 0
4 55 retired married basic.4y no yes no cellular aug fri ... 1 3 1 success -2.9 92.201 -31.4 0.869 5076.2 1

5 rows × 21 columns

**
**
1
2
data.info()
print(data.shape)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp_var_rate      41188 non-null float64
cons_price_idx    41188 non-null float64
cons_conf_idx     41188 non-null float64
euribor3m         41188 non-null float64
nr_employed       41188 non-null float64
y                 41188 non-null int64
dtypes: float64(5), int64(6), object(10)
memory usage: 6.6+ MB
(41188, 21)
1
2
data.dropna(inplace=True)
print(data.columns)
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx',
       'cons_conf_idx', 'euribor3m', 'nr_employed', 'y'],
      dtype='object')

特征的意义:

bank client data:

  • 1 - age (numeric)
  • 2 - job : type of job (categorical: ‘admin.’,’blue-collar’,’entrepreneur’,’housemaid’,’management’,’retired’,’self-employed’,’services’,’student’,’technician’,’unemployed’,’unknown’)
  • 3 - marital : marital status (categorical: ‘divorced’,’married’,’single’,’unknown’; note: ‘divorced’ means divorced or widowed)
  • 4 - education (categorical: ‘basic.4y’,’basic.6y’,’basic.9y’,’high.school’,’illiterate’,’professional.course’,’university.degree’,’unknown’)
  • 5 - default: has credit in default? (categorical: ‘no’,’yes’,’unknown’)
  • 6 - housing: has housing loan? (categorical: ‘no’,’yes’,’unknown’)
  • 7 - loan: has personal loan? (categorical: ‘no’,’yes’,’unknown’)

related with the last contact of the current campaign:

  • 8 - contact: contact communication type (categorical: ‘cellular’,’telephone’)
  • 9 - month: last contact month of year (categorical: ‘jan’, ‘feb’, ‘mar’, …, ‘nov’, ‘dec’)
  • 10 - day_of_week: last contact day of the week (categorical: ‘mon’,’tue’,’wed’,’thu’,’fri’)
  • 11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y=’no’). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

other attributes:

  • 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
  • 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; * 999 means client was not previously contacted)
  • 14 - previous: number of contacts performed before this campaign and for this client (numeric)
  • 15 - poutcome: outcome of the previous marketing campaign (categorical: ‘failure’,’nonexistent’,’success’)

social and economic context attributes

  • 16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
  • 17 - cons.price.idx: consumer price index - monthly indicator (numeric)
  • 18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
  • 19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
  • 20 - nr.employed: number of employees - quarterly indicator (numeric)

Output variable (desired target):

  • 21 - y - has the client subscribed a term deposit? (binary: ‘yes’,’no’)

其中的y列,0代表推销不成功,1代表推销成功

特征工程

1
data['education'].unique()
array(['basic.4y', 'unknown', 'university.degree', 'high.school',
       'basic.9y', 'professional.course', 'basic.6y', 'illiterate'],
      dtype=object)
1
2
3
4
5
# 觉得基础教育4-9年没区别,所以放在一组
data['education'] = np.where(data['education']=='basic.4y','Basic',data['education'])
data['education'] = np.where(data['education']=='basic.6y','Basic',data['education'])
data['education'] = np.where(data['education']=='basic.9y','Basic',data['education'])
data['education'].unique()
array(['Basic', 'unknown', 'university.degree', 'high.school',
       'professional.course', 'illiterate'], dtype=object)
1
data['y'].value_counts()
0    36548
1     4640
Name: y, dtype: int64
1
2
3
sns.countplot(x='y',data=data,palette='hls')
plt.show()
#数据特别不平衡,差别太大比例是7:1

png

1
2
3
4
5
6
7
# Quantify the class imbalance; ideally classes 0 and 1 would be near 1:1.
count_no_sub = len(data[data['y']==0])
count_sub = len(data[data['y']==1])
pct_of_no_sub = count_no_sub / (count_no_sub+count_sub)
print("未开户的百分比: %.2f%%"%(pct_of_no_sub*100))
pct_of_sub = count_sub /(count_no_sub+count_sub)
print("开户的百分比:%.2f%%"%(pct_of_sub*100))
未开户的百分比: 88.73%
开户的百分比:11.27%
1
data.groupby('y').mean()

age duration campaign pdays previous emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed
y
0 39.911185 220.844807 2.633085 984.113878 0.132374 0.248875 93.603757 -40.593097 3.811491 5176.166600
1 40.913147 553.191164 2.051724 792.035560 0.492672 -1.233448 93.354386 -39.789784 2.123135 5095.115991
**
**

数据可视化分析

购买定期存款的客户的平均年龄高于未购买定期存款的客户的平均年龄。

购买定期存款的客户的 pdays(自上次联系客户以来的日子)较低。 pdays越低,最后一次通话的记忆越好,因此销售的机会就越大。

令人惊讶的是,购买定期存款的客户的销售通话次数较低。

我们可以计算其他特征值(如教育和婚姻状况)的分布,以更详细地了解我们的数据。

1
data.groupby('job').mean()

age duration campaign pdays previous emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed y
job
admin. 38.187296 254.312128 2.623489 954.319229 0.189023 0.015563 93.534054 -40.245433 3.550274 5164.125350 0.129726
blue-collar 39.555760 264.542360 2.558461 985.160363 0.122542 0.248995 93.656656 -41.375816 3.771996 5175.615150 0.068943
entrepreneur 41.723214 263.267857 2.535714 981.267170 0.138736 0.158723 93.605372 -41.283654 3.791120 5176.313530 0.085165
housemaid 45.500000 250.454717 2.639623 960.579245 0.137736 0.433396 93.676576 -39.495283 4.009645 5179.529623 0.100000
management 42.362859 257.058140 2.476060 962.647059 0.185021 -0.012688 93.522755 -40.489466 3.611316 5166.650513 0.112175
retired 62.027326 273.712209 2.476744 897.936047 0.327326 -0.698314 93.430786 -38.573081 2.770066 5122.262151 0.252326
self-employed 39.949331 264.142153 2.660802 976.621393 0.143561 0.094159 93.559982 -40.488107 3.689376 5170.674384 0.104856
services 37.926430 258.398085 2.587805 979.974049 0.154951 0.175359 93.634659 -41.290048 3.699187 5171.600126 0.081381
student 25.894857 283.683429 2.104000 840.217143 0.524571 -1.408000 93.331613 -40.187543 1.884224 5085.939086 0.314286
technician 38.507638 250.232241 2.577339 964.408127 0.153789 0.274566 93.561471 -39.927569 3.820401 5175.648391 0.108260
unemployed 39.733728 249.451677 2.564103 935.316568 0.199211 -0.111736 93.563781 -40.007594 3.466583 5157.156509 0.142012
unknown 45.563636 239.675758 2.648485 938.727273 0.154545 0.357879 93.718942 -38.797879 3.949033 5172.931818 0.112121
**
**
1
data.groupby('marital').mean()

age duration campaign pdays previous emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed y
marital
divorced 44.899393 253.790330 2.61340 968.639853 0.168690 0.163985 93.606563 -40.707069 3.715603 5170.878643 0.103209
married 42.307165 257.438623 2.57281 967.247673 0.155608 0.183625 93.597367 -40.270659 3.745832 5171.848772 0.101573
single 33.158714 261.524378 2.53380 949.909578 0.211359 -0.167989 93.517300 -40.918698 3.317447 5155.199265 0.140041
unknown 40.275000 312.725000 3.18750 937.100000 0.275000 -0.221250 93.471250 -40.820000 3.313038 5157.393750 0.150000
**
**
1
data.groupby('education').mean()

age duration campaign pdays previous emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed y
education
Basic 42.163910 263.043874 2.559498 974.877967 0.141053 0.191329 93.639933 -40.927595 3.729654 5172.014113 0.087029
high.school 37.998213 260.886810 2.568576 964.358382 0.185917 0.032937 93.584857 -40.940641 3.556157 5164.994735 0.108355
illiterate 48.500000 276.777778 2.277778 943.833333 0.111111 -0.133333 93.317333 -39.950000 3.516556 5171.777778 0.222222
professional.course 40.080107 252.533855 2.586115 960.765974 0.163075 0.173012 93.569864 -40.124108 3.710457 5170.155979 0.113485
university.degree 38.879191 253.223373 2.563527 951.807692 0.192390 -0.028090 93.493466 -39.975805 3.529663 5163.226298 0.137245
unknown 43.481225 262.390526 2.596187 942.830734 0.226459 0.059099 93.658615 -39.877816 3.571098 5159.549509 0.145003
**
**
1
2
3
4
5
6
7
%matplotlib inline
table=pd.crosstab(data.job,data.y)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Job title vs Purchase')
plt.xlabel('Job')
plt.ylabel('Proportion of Purchase')
plt.savefig('purchase_vs_job')

png

具有不同职位的人购买存款的频率不一样。 因此,职称可以是良好的预测因素。

1
2
3
4
5
6
# Per-marital-status proportions of the purchase outcome, as a stacked bar chart.
marital_ct = pd.crosstab(data.marital, data.y)
row_share = marital_ct.div(marital_ct.sum(axis=1).astype(float), axis=0)
row_share.plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Marital Status vs Purchase')
plt.xlabel('Marital Status')
plt.ylabel('Proportion of Customers')
plt.savefig('mariral_vs_pur_stack')

png

婚姻状况似乎不是好的预测因素。

1
2
3
4
5
6
table=pd.crosstab(data.education,data.y)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Customers')
plt.savefig('edu_vs_pur_stack')

png

教育似乎是结果变量的良好预测指标。

1
2
3
4
5
6
table=pd.crosstab(data.day_of_week,data.y)#.plot(kind='bar')
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Day of Week vs Purchase')
plt.xlabel('Day of Week')
plt.ylabel('Proportion of Purchase')
plt.savefig('dow_vs_purchase')

png

一周工作时间不是预测结果的良好预测因素。

1
2
3
4
# One-hot encode each categorical column and append the dummy columns to
# the frame (the original columns are dropped in the next cell).
# NOTE: the extracted snippet had lost the loop-body indentation; restored here.
cat_vars=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
for var in cat_vars:
    cat_list = pd.get_dummies(data[var], prefix=var)
    data = data.join(cat_list)
1
2
data_final=data.drop(cat_vars, axis=1)
data_final.columns.values
array(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
       'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student',
       'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_Basic', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'default_no',
       'default_unknown', 'default_yes', 'housing_no', 'housing_unknown',
       'housing_yes', 'loan_no', 'loan_unknown', 'loan_yes',
       'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug',
       'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri',
       'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
       'day_of_week_wed', 'poutcome_failure', 'poutcome_nonexistent',
       'poutcome_success'], dtype=object)

……
了解数据的特征和关系

当训练数据不平衡的时候,我们可以采取的措施:

  • 把数据量小的扩充成与大的1:1的比例(SMOTE采样,推荐)
  • 把数据量大的删掉一部分,与小的保持1:1的比例(不推荐)

使用SMOTE进行过采样

创建我们的训练数据后,我将使用SMOTE算法(合成少数过采样技术)对已经开户的用户进行上采样。 在高层次上,SMOTE:

通过从次要类(已经开户的用户)创建合成样本而不是创建副本来工作。
随机选择一个k-最近邻居并使用它来创建一个类似但随机调整的新观察结果。

使用如下命令安装:
conda install -c conda-forge imbalanced-learn

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
X = data_final.loc[:, data_final.columns != 'y'] # feature columns: everything except the target y
y = data_final.loc[:, data_final.columns == 'y'].values.ravel() # target column as a flat 1-D array

# Oversample the minority class with SMOTE — on the TRAINING split only.
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0) # fixed seed for reproducibility; NOTE: `os` shadows the stdlib module name
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
os_data_X,os_data_y=os.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])
# Check the class balance after resampling (should be exactly 50/50).
print("过采样以后的数据量: ",len(os_data_X))
print("未开户的用户数量: ",len(os_data_y[os_data_y['y']==0]))
print("开户的用户数量: ",len(os_data_y[os_data_y['y']==1]))
print("未开户的用户数量的百分比: ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("开户的用户数量的百分比: ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))
过采样以后的数据量:  51134
未开户的用户数量:  25567
开户的用户数量:  25567
未开户的用户数量的百分比:  0.5
开户的用户数量的百分比:  0.5

现在我们拥有完美平衡的数据! 您可能已经注意到我仅对训练数据进行了过采样

就算实际的数据就不是1:1,那么我们也需要用过采样保证数据1:1
(本来开户的就是小部分,但我们仍然需要弄成1:1)

逻辑回归训练

1
2
3
4
5
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Fit logistic regression on the SMOTE-balanced training data.
logreg = LogisticRegression()
logreg.fit(os_data_X, os_data_y.values.reshape(-1))
1
2
y_pred = logreg.predict(X_test) #测试数据
print("训练出来的准确率为:{:.2f}".format(logreg.score(X_test,y_test)))
训练出来的准确率为:0.89

模型评估

1
2
3
4
# classification_report gives per-class precision / recall / f1 (higher is
# better) and support (how many test samples belong to each class).
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred)) # the test split is NOT oversampled
              precision    recall  f1-score   support

           0       0.96      0.92      0.94     10981
           1       0.50      0.67      0.58      1376

    accuracy                           0.89     12357
   macro avg       0.73      0.80      0.76     12357
weighted avg       0.91      0.89      0.90     12357

对0类别的判断有96%的准确率

对1类别的判断只有50%的准确率

但由于数据量不平衡,按各类别样本数加权平均(weighted avg)后的precision为91%;这并不代表整体准确率就是91%

补充看分类模型的效果方法-roc_auc_score

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Use the predicted probability of the positive class for BOTH the AUC and
# the curve. The original passed hard 0/1 labels from .predict() into
# roc_auc_score while drawing the curve from predict_proba, so the legend's
# "area" did not match the plotted curve.
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal = random-guess baseline
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

png

算蓝色曲线与X轴的面积,越大越好,越大效果越好

这条ROC曲线下的面积(AUC)约占1×1正方形的0.8,即AUC≈0.8;注意AUC衡量的是模型区分正负样本的排序能力,不能直接等同于准确率

红色虚线就是随机乱猜-50%的概率

逻辑回归和线性回归的区别

  • 线性回归一般是用于预测连续的值,比如说年龄,发病率
  • 逻辑回归一般用于分类(一般是二元分类)
顺便评个分吧!👇

线性回归分析与股票价格预测实战

Posted on 2021-07-27 | In 机器学习 , 算法

机器学习算法线性回归

回归分析

线性回归(英语:linearregression)

  • 是利用线性回归方程的最小二乘函数对一个或多个自变量和因变量之间关系建模的方法

image-20210727185758930

线性回归的应用

image-20210727185826111

线性回归的数学定义

image-20210727185904264

image-20210727185926594

线性回归的解析解与推导解释

公式:

image-20210727185953662

解析解推导:

image-20210727190140480

image-20210727190222068

image-20210727190234286

实战案例-股票回归预测(时间序列操作)

1
2
3
4
5
6
7
# 你可以使用如下的方法下载某一个公司的股票交易历史
# 000001 为平安银行
# 如果你还没有安装, 可以使用 pip install tushare 安装tushare python包
import tushare as ts
df = ts.get_hist_data('000001')
print(df)
df.to_csv('000001.csv')

1
2
3
4
5
# 预测股票
import numpy as np # 数学计算
import pandas as pd # 数据处理, 读取 CSV 文件 (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from datetime import datetime as dt #做时间处理
1
2
# NOTE(review): the download cell above fetched 000001 (Ping An Bank) into
# '000001.csv', but this reads '000008.csv' — confirm which file is intended.
df = pd.read_csv('000008.csv') # daily stock history
df.head()

date open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover
0 2021-07-27 2.13 2.14 2.10 2.09 134664.52 -0.03 -1.41 2.150 2.162 2.176 131424.11 110258.76 106935.07 0.51
1 2021-07-26 2.17 2.18 2.13 2.11 223703.88 -0.04 -1.84 2.166 2.172 2.183 121426.95 106060.87 110351.05 0.84
2 2021-07-23 2.18 2.19 2.17 2.17 96939.13 -0.01 -0.46 2.174 2.178 2.186 95522.00 99129.56 102828.30 0.36
3 2021-07-22 2.17 2.19 2.18 2.17 95186.02 0.01 0.46 2.174 2.179 2.188 92234.78 101179.23 103448.75 0.36
4 2021-07-21 2.17 2.19 2.17 2.16 106627.00 -0.01 -0.46 2.172 2.177 2.190 93422.51 104647.68 104303.14 0.40
**
**
1
df.shape
(606, 15)
1
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606 entries, 0 to 605
Data columns (total 15 columns):
date            606 non-null object
open            606 non-null float64
high            606 non-null float64
close           606 non-null float64
low             606 non-null float64
volume          606 non-null float64
price_change    606 non-null float64
p_change        606 non-null float64
ma5             606 non-null float64
ma10            606 non-null float64
ma20            606 non-null float64
v_ma5           606 non-null float64
v_ma10          606 non-null float64
v_ma20          606 non-null float64
turnover        606 non-null float64
dtypes: float64(14), object(1)
memory usage: 71.1+ KB

股票数据的特征

  • date:日期
  • open:开盘价
  • high:最高价
  • close:收盘价
  • low:最低价
  • volume:成交量
  • price_change:价格变动
  • p_change:涨跌幅
  • ma5:5日均价
  • ma10:10日均价
  • ma20:20日均价
  • v_ma5:5日均量
  • v_ma10:10日均量
  • v_ma20:20日均量
1
2
df['date'].head()
# 发现日期是字符串的类型,要把它转换成时间序列
0    2021-07-27
1    2021-07-26
2    2021-07-23
3    2021-07-22
4    2021-07-21
Name: date, dtype: object
1
2
df['date'] = pd.to_datetime(df['date'])
df['date'].head()
0   2021-07-27
1   2021-07-26
2   2021-07-23
3   2021-07-22
4   2021-07-21
Name: date, dtype: datetime64[ns]
1
2
df = df.set_index('date') # use the trade date as the index; the index is not a feature/target
df.head()

open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover
date
2021-07-27 2.13 2.14 2.10 2.09 134664.52 -0.03 -1.41 2.150 2.162 2.176 131424.11 110258.76 106935.07 0.51
2021-07-26 2.17 2.18 2.13 2.11 223703.88 -0.04 -1.84 2.166 2.172 2.183 121426.95 106060.87 110351.05 0.84
2021-07-23 2.18 2.19 2.17 2.17 96939.13 -0.01 -0.46 2.174 2.178 2.186 95522.00 99129.56 102828.30 0.36
2021-07-22 2.17 2.19 2.18 2.17 95186.02 0.01 0.46 2.174 2.179 2.188 92234.78 101179.23 103448.75 0.36
2021-07-21 2.17 2.19 2.17 2.16 106627.00 -0.01 -0.46 2.172 2.177 2.190 93422.51 104647.68 104303.14 0.40
**
**
1
2
df.sort_values(by='date',inplace=True,ascending=True)  #ascending时间升序排序
df.tail()

open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover
date
2021-07-21 2.17 2.19 2.17 2.16 106627.00 -0.01 -0.46 2.172 2.177 2.190 93422.51 104647.68 104303.14 0.40
2021-07-22 2.17 2.19 2.18 2.17 95186.02 0.01 0.46 2.174 2.179 2.188 92234.78 101179.23 103448.75 0.36
2021-07-23 2.18 2.19 2.17 2.17 96939.13 -0.01 -0.46 2.174 2.178 2.186 95522.00 99129.56 102828.30 0.36
2021-07-26 2.17 2.18 2.13 2.11 223703.88 -0.04 -1.84 2.166 2.172 2.183 121426.95 106060.87 110351.05 0.84
2021-07-27 2.13 2.14 2.10 2.09 134664.52 -0.03 -1.41 2.150 2.162 2.176 131424.11 110258.76 106935.07 0.51
**
**
1
2
3
# 检测是否有缺失值,并处理缺失值
df.dropna(axis=0,inplace=True)
df.isna().sum()
open            0
high            0
close           0
low             0
volume          0
price_change    0
p_change        0
ma5             0
ma10            0
ma20            0
v_ma5           0
v_ma10          0
v_ma20          0
turnover        0
dtype: int64

K线图

1
2
3
4
5
Min_date = df.index.min()
Max_date = df.index.max()
print("起始日期为",Min_date)
print("终止日期为",Max_date)
print("总共间隔时间为:",Max_date-Min_date) #包括了休息日和工作日
起始日期为 2019-01-28 00:00:00
终止日期为 2021-07-27 00:00:00
总共间隔时间为: 911 days 00:00:00
1
2
3
4
5
6
7
8
9
10
from plotly import tools
from plotly.graph_objs import *
from plotly.offline import init_notebook_mode, iplot, iplot_mpl
init_notebook_mode()
import chart_studio.plotly as py
import plotly.graph_objs as go

trace = go.Ohlc(x=df.index, open=df['open'], high=df['high'], low=df['low'], close=df['close'])
data = [trace]
iplot(data, filename='simple_ohlc')

image-20210727185408672

线性回归

1
2
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
1
2
3
4
# Create the prediction target: the closing price `num` trading days ahead.
num = 5 # forecast horizon in rows (trading days)
df['label'] = df['close'].shift(-num) # shift up so each row holds the close `num` rows later; the last `num` rows become NaN
df.shape
(607, 15)
1
df.tail()

open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover label
date
2021-07-21 20.58 20.80 20.45 20.20 753076.50 -0.15 -0.73 21.044 21.104 21.881 721650.49 821114.23 754829.76 0.39 NaN
2021-07-22 20.45 20.66 20.38 20.24 868648.06 -0.07 -0.34 20.796 20.991 21.746 704179.49 762273.08 778747.99 0.45 NaN
2021-07-23 20.38 20.38 20.10 20.00 727404.31 -0.28 -1.37 20.548 20.874 21.583 735059.83 758777.04 787119.55 0.37 NaN
2021-07-26 19.96 19.99 18.80 18.43 1451304.00 -1.30 -6.47 20.066 20.638 21.384 920893.44 816117.33 828371.89 0.75 NaN
2021-07-27 18.87 18.99 17.76 17.60 1357551.50 -1.04 -5.53 19.498 20.302 21.155 1031596.87 886111.09 843761.99 0.70 NaN
**
**
1
2
3
# Build the feature frame: drop the target 'label' plus 'price_change' and
# 'p_change', which are not used as predictors here.
data = df.drop(['label','price_change','p_change'],axis=1) # drop these three columns
data.tail()

open high close low volume ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover
date
2021-07-21 20.58 20.80 20.45 20.20 753076.50 21.044 21.104 21.881 721650.49 821114.23 754829.76 0.39
2021-07-22 20.45 20.66 20.38 20.24 868648.06 20.796 20.991 21.746 704179.49 762273.08 778747.99 0.45
2021-07-23 20.38 20.38 20.10 20.00 727404.31 20.548 20.874 21.583 735059.83 758777.04 787119.55 0.37
2021-07-26 19.96 19.99 18.80 18.43 1451304.00 20.066 20.638 21.384 920893.44 816117.33 828371.89 0.75
2021-07-27 18.87 18.99 17.76 17.60 1357551.50 19.498 20.302 21.155 1031596.87 886111.09 843761.99 0.70
**
**
1
2
3
4
5
6
7
8
9
10
11
12
13
# Feature matrix.
X = data.values
X = preprocessing.scale(X) # standardize every column to mean 0, variance 1
# NOTE(review): the scaler sees all rows (train and test) before the split
# below — mild test-set leakage; confirm this is acceptable for the demo.
X =X[:-num] # drop the last `num` rows: their `num`-days-ahead close is unknown

# Target vector: drop the rows whose shifted label is NaN, then take it.
df.dropna(inplace= True)
Target = df.label
y = Target.values

print(np.shape(X),np.shape(y))
X
(602, 12) (602,)





array([[-1.45811487, -1.47029285, -1.49070911, ..., -0.03578925,
        -0.04417715,  0.1441613 ],
       [-1.47985847, -1.48891936, -1.4743962 , ..., -0.32467616,
        -0.37352025, -0.27959612],
       [-1.48257641, -1.45964914, -1.48799029, ..., -0.52646157,
        -0.60356403, -0.52678795],
       ...,
       [ 1.36039855,  1.37157982,  1.33686078, ..., -0.61747576,
        -0.95329394, -0.91523224],
       [ 1.3468088 ,  1.26780357,  1.30151616, ..., -0.71057098,
        -0.98416008, -1.0211716 ],
       [ 1.27614212,  1.20660219,  1.13566831, ..., -0.70723675,
        -0.97687653, -0.52678795]])
1
2
3
4
5
6
7
8
9
10
##### 将数据分为训练数据和测试数据
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=2021,test_size=0.2) #设置随机数种子 设置测试集0.2
# #自己设置比例
# X_train,y_train = X[0:550,:],y[0:550]
# X_test,y_test = X[550:,:],y[550:602]
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
(481, 12)
(481,)
(121, 12)
(121,)
1
2
3
4
lr = LinearRegression() # linear regression model (the original comment said "logistic" — this is linear)
lr.fit(X_train,y_train)
lr.score(X_test,y_test) # R^2, the coefficient of determination
# The closer R^2 is to 1, the better the fit.
0.9537276319885583
1
2
3
4
5
6
# 做预测 
X_Predict = X[-num:]
Forecast = lr.predict(X_Predict)
print(Forecast) #预测最后5天的结果
print(y[-num:]) #实际最后5天的结果
print(X_Predict) #最后5天的特征
[20.58546463 21.30633579 21.06762328 20.90177206 20.47861039]
[20.45 20.38 20.1  18.8  17.76]
[[ 1.24896263  1.15338361  1.17916938  1.18973775 -0.38166387  1.29792419
   1.49107504  1.66146894 -0.31590711 -0.56046406 -1.01335038 -0.42084859]
 [ 1.17286005  1.34497053  1.41298766  1.2512624  -0.17701624  1.3039143
   1.44795391  1.64112995 -0.56317108 -0.48007827 -0.93423686 -0.244283  ]
 [ 1.36039855  1.37157982  1.33686078  1.43863295 -0.90240558  1.30772619
   1.43512674  1.62326461 -0.65661848 -0.61747576 -0.95329394 -0.91523224]
 [ 1.3468088   1.26780357  1.30151616  1.29321103 -0.99874522  1.31044897
   1.41192866  1.61007175 -0.83218332 -0.71057098 -0.98416008 -1.0211716 ]
 [ 1.27614212  1.20660219  1.13566831  1.2065172  -0.4648397   1.28213206
   1.35243241  1.57763929 -0.75992703 -0.70723675 -0.97687653 -0.52678795]]

画预测结果

1
2
3
# Build a date index for the 5 forecast rows.
# NOTE(review): starting at 2021-07-19 with periods=5 covers 07-19..07-23,
# not 07-21..07-27 as the original comment claimed — confirm intended dates.
trange = pd.date_range('2021-07-19',periods=num,freq='d')
trange
DatetimeIndex(['2021-07-19', '2021-07-20', '2021-07-21', '2021-07-22',
               '2021-07-23'],
              dtype='datetime64[ns]', freq='D')
1
2
3
4
# 产生预测值dataframe
Predict_df = pd.DataFrame(Forecast, index=trange)
Predict_df.columns = ['forecast']
Predict_df

forecast
2021-07-19 20.585465
2021-07-20 21.306336
2021-07-21 21.067623
2021-07-22 20.901772
2021-07-23 20.478610
**
**
1
2
3
4
5
6
7
8
9
10
# 将预测值添加到原始dataframe
df = pd.read_csv('./000001.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
# 按照时间升序排列
df.sort_values(by=['date'], inplace=True, ascending=True)
df_concat = pd.concat([df, Predict_df], axis=1)

df_concat = df_concat[df_concat.index.isin(Predict_df.index)]
df_concat.tail(num)

open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover forecast
2021-07-19 21.36 21.43 21.21 20.78 522135.97 -0.13 -0.61 21.210 21.532 22.134 711341.23 791493.36 737257.18 0.27 20.585465
2021-07-20 21.10 21.20 20.60 20.47 804034.31 -0.61 -2.88 21.106 21.314 22.016 740625.30 792701.07 739571.32 0.41 21.306336
2021-07-21 20.58 20.80 20.45 20.20 753076.50 -0.15 -0.73 21.044 21.104 21.881 721650.49 821114.23 754829.76 0.39 21.067623
2021-07-22 20.45 20.66 20.38 20.24 868648.06 -0.07 -0.34 20.796 20.991 21.746 704179.49 762273.08 778747.99 0.45 20.901772
2021-07-23 20.38 20.38 20.10 20.00 727404.31 -0.28 -1.37 20.548 20.874 21.583 735059.83 758777.04 787119.55 0.37 20.478610
**
**
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Axes:坐标系 
#Subplot
fig = plt.figure(figsize=(16,9))
ax1 = fig.add_subplot(221) #221代表子图的位置 221:几行几列 第几个 2行2列第一个

ax2 = fig.add_subplot(222) #
ax3 = fig.add_subplot(212)
# ax4 = fig.add_subplot(224)
ax1.plot(df_concat['forecast'],linewidth=1,label='实际值',color="orange")
ax2.plot(df_concat['close'],linewidth=1,label='实际值',color="green")
ax1.set(title='预测结果',ylabel='预测值',xlabel='时间')
ax2.set(title='实际结果',ylabel='预测值',xlabel='时间')
# 画预测值和实际值

ax3 = df_concat['close'].plot(color='green', linewidth=1)
ax3 =df_concat['forecast'].plot(color='orange', linewidth=3)
plt.xlabel('时间')
plt.ylabel('价格')

plt.show()

png

权重特征

1
2
3
4
# Understand the model: print each feature's learned weight.
# The original hard-coded list omitted 'turnover' (the 12th feature), so its
# coefficient was never printed; it is added back here. The loop-body
# indentation, lost in extraction, is also restored.
for idx, col_name in enumerate(['open', 'high', 'close', 'low', 'volume', 'ma5', 'ma10', 'ma20', 'v_ma5', 'v_ma10', 'v_ma20', 'turnover']):
    print("The coefficient for {} is {}".format(col_name, lr.coef_[idx]))
The coefficient for open is -0.6740136248471393
The coefficient for high is -0.3406015889420868
The coefficient for close is 2.806477446216583
The coefficient for low is 0.8835154370662383
The coefficient for volume is 0.701804956701415
The coefficient for ma5 is 1.1450434713655455
The coefficient for ma10 is -0.14538595012110592
The coefficient for ma20 is -0.19140806216593198
The coefficient for v_ma5 is 0.050144303955445196
The coefficient for v_ma10 is -0.21247175374027985
The coefficient for v_ma20 is 0.04820620468948662

看课外阅读

顺便评个分吧!👇

KNN算法实战-预测二手汽车价格

Posted on 2021-07-26 | In 机器学习 , 算法

机器学习算法KNN

因为是预测价格,所以不需要使用到验证k,

1
2
3
4
5
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
1
2
3
4
#读取数据
df = pd.read_csv('data.csv')
df.head() # data frame
df

Brand Type Color Construction Year Odometer Ask Price Days Until MOT HP
0 Peugeot 106 1.0 blue 2002 166879 999 138 60
1 Peugeot 106 1.0 blue 1998 234484 999 346 60
2 Peugeot 106 1.1 black 1997 219752 500 -5 60
3 Peugeot 106 1.1 red 2001 223692 750 -87 60
4 Peugeot 106 1.1 grey 2002 120275 1650 356 59
5 Peugeot 106 1.1 red 2003 131358 1399 266 60
6 Peugeot 106 1.1 green 1999 304277 799 173 57
7 Peugeot 106 1.4 green 1998 93685 1300 0 75
8 Peugeot 106 1.1 white 2002 225935 950 113 60
9 Peugeot 106 1.4 green 1997 252319 650 133 75
10 Peugeot 106 1.0 black 1998 220000 700 82 50
11 Peugeot 106 1.1 black 1997 212000 700 75 60
12 Peugeot 106 1.1 black 2003 255134 799 197 60
**
**
1
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 8 columns):
Brand                13 non-null object
Type                 13 non-null float64
Color                13 non-null object
Construction Year    13 non-null int64
Odometer             13 non-null int64
Ask Price            13 non-null int64
Days Until MOT       13 non-null int64
HP                   13 non-null int64
dtypes: float64(1), int64(5), object(2)
memory usage: 912.0+ bytes
1
2
3
4
5
6
7
8
9
10
11
# Clean the data.
# One-hot encode the colour column.
df_colors = df['Color'].str.get_dummies().add_prefix('Color: ')
# One-hot encode the engine type (cast to str so it is treated as categorical).
df_type = df['Type'].apply(str).str.get_dummies().add_prefix('Type: ')
# Append the dummy columns to the frame.
df = pd.concat([df, df_colors, df_type], axis=1)
# Drop the original encoded columns.
df = df.drop(['Brand', 'Type', 'Color'], axis=1)
df.head()
# 'Brand' is the same for every row, so it carries no information.

Construction Year Odometer Ask Price Days Until MOT HP Color: black Color: blue Color: green Color: grey Color: red Color: white Type: 1.0 Type: 1.1 Type: 1.4
0 2002 166879 999 138 60 0 1 0 0 0 0 1 0 0
1 1998 234484 999 346 60 0 1 0 0 0 0 1 0 0
2 1997 219752 500 -5 60 1 0 0 0 0 0 0 1 0
3 2001 223692 750 -87 60 0 0 0 0 1 0 0 1 0
4 2002 120275 1650 356 59 0 0 0 1 0 0 0 1 0
**
**
1
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 14 columns):
Construction Year    13 non-null int64
Odometer             13 non-null int64
Ask Price            13 non-null int64
Days Until MOT       13 non-null int64
HP                   13 non-null int64
Color: black         13 non-null int64
Color: blue          13 non-null int64
Color: green         13 non-null int64
Color: grey          13 non-null int64
Color: red           13 non-null int64
Color: white         13 non-null int64
Type: 1.0            13 non-null int64
Type: 1.1            13 non-null int64
Type: 1.4            13 non-null int64
dtypes: int64(14)
memory usage: 1.5 KB
1
2
3
4
5
6
# 数据转换
matrix = df.corr()
f, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(matrix, square=True)
plt.title('Car Price Variables')
# 观察特征关联性
Text(0.5,1,'Car Price Variables')

png

1
2
sns.pairplot(df[['Construction Year', 'Days Until MOT', 'Odometer', 'Ask Price']], size=2)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA #降维
from mpl_toolkits.mplot3d import Axes3D #绘制3D图
# 主成分分析 将n维压缩成三维
pca_3 = PCA(n_components=3) #保留三组特征
# X = pca_3.fit_transform(df[['Construction Year','Odometer','Ask Price','Days Until MOT','HP','Color: black','Color: blue','Color: green','Color: grey','Color: red','Color: white','Type: 1.0','Type: 1.1','Type: 1.4']])
#PCA降维,看着使用,对比一下
X = df[['Construction Year', 'Days Until MOT', 'Odometer']] #只选3个特征。根据特征图选择
X

Construction Year Days Until MOT Odometer
0 2002 138 166879
1 1998 346 234484
2 1997 -5 219752
3 2001 -87 223692
4 2002 356 120275
5 2003 266 131358
6 1999 173 304277
7 1998 0 93685
8 2002 113 225935
9 1997 133 252319
10 1998 82 220000
11 1997 75 212000
12 2003 197 255134
**
**
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import numpy as np
from collections import Counter # imported as a voting helper; unused in this regression snippet
# Price is a continuous value, so this is regression, not classification.
y = df['Ask Price'].values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=41)

X_normalizer = StandardScaler() # standardize features to N(0, 1)
X_train = X_normalizer.fit_transform(X_train) # fit on the training split, then transform
X_test = X_normalizer.transform(X_test) # reuse the training statistics on the test split

y_normalizer = StandardScaler()
y_train = y_normalizer.fit_transform(y_train) # fit + transform on training targets only
y_test = y_normalizer.transform(y_test)

knn = KNeighborsRegressor(n_neighbors=2) # regression averages neighbours, so k=2 has no tie-break issue
knn.fit(X_train, y_train.ravel())

# Predict prices on the held-out set.
y_pred = knn.predict(X_test)
y_pred_inv = y_normalizer.inverse_transform(y_pred) # back to the original price scale; NOTE(review): newer scikit-learn expects a 2-D array here — confirm the version in use
y_test_inv = y_normalizer.inverse_transform(y_test) # back to the original price scale

# Scatter predictions against the true values.
plt.scatter(y_pred_inv, y_test_inv)
plt.xlabel('Prediction')
plt.ylabel('Real value')

# Add the y = x "perfect prediction" reference line.
diagonal = np.linspace(500, 1500, 100) # 100 points spanning the 500-1500 price range
plt.plot(diagonal, diagonal, '-r')
plt.xlabel('Predicted ask price')
plt.ylabel('Ask price')
plt.show()
# Roughly eyeball how far points fall from the ideal line.
# 调整画大致的预测线

png

1
2
knn
X_test
array([[-0.97085465, -0.99146682, -3.07178611],
       [ 0.66426897,  1.71581745, -2.36134577],
       [-1.37963555,  0.01996242,  1.1666491 ],
       [-0.56207374,  0.32415166,  2.55487998],
       [-0.97085465, -0.36787887,  0.30313947]])
1
pred = knn.predict(X_test)
1
pred
array([ 1.24495111,  1.24495111, -1.14519542,  0.04788274, -1.14519542])
1
2
3
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_pred_inv, y_test_inv)
160.4
1
2
3
from sklearn.metrics import mean_squared_error

mean_squared_error(y_pred_inv, y_test_inv)
47220.4
1
y_pred_inv
array([1199., 1199.,  600.,  899.,  600.])
1
y_test_inv
array([[1300.],
       [1650.],
       [ 650.],
       [ 799.],
       [ 700.]])
1
2
3
4
5
6
# Mean absolute percentage deviation between predictions and actuals.
# y_test_inv is (5, 1) and y_pred_inv is (5,), so the subtraction broadcasts
# to a (5, 5) matrix; .diagonal() recovers the paired differences.
u = (y_test_inv - y_pred_inv).diagonal()
u
x = 0
n_samples = len(u)
# The original looped range(0, 4) and divided by 4, silently skipping the
# last of the 5 test samples; average over every sample instead.
for i in range(n_samples):
    x += abs(u[i]/y_test_inv[i][0])
print("偏差率为:{:.4}".format(x/n_samples))
偏差率为:0.1383

验证K

本例是预测价格的回归分析,这里没有对k做交叉验证;但严格来说,k作为超参数,无论回归还是分类都可以(也建议)用验证集/交叉验证来选择

顺便评个分吧!👇

K-Means算法分析与实现

Posted on 2021-07-24 | In 机器学习 , 算法

机器学习算法K-Means

在数据挖掘中,聚类是一个很重要的概念.

  • 物以类聚,人以群分

概述

1.聚类:

“类”指的是具有相似性的集合,聚类是指把数据集分为若干个类

2.K-Means:

是一种比较简单的迭代型聚类算法,采用距离作为相似性指标.

png

3.K-Means算法流程

  • 随机选择K个样本作为聚类中心

  • 计算每个样本与各个聚类中心的距离

  • 将各样本回归于与之最近的聚类中心

  • 求各个类的样本的均值,作为新的聚类中心

  • 判定:若类中心不发变动或者达到迭代次数,算法结束,否则回到第二步

4. K-Means举例

将a~d四个点聚为两类:
选定样本a和b为初始聚类中心,中心值分别为1、2

1615817983435

5. 代码演示

1
import numpy as np
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Generate 100 sample points on the line y = x.
x = np.linspace(0, 99, 100)
y = np.linspace(0, 99, 100)

k = 2  # number of clusters (labels 0 and 1)
n = len(x)
# dis[i] = [distance to center1, distance to center2, assigned cluster id]
dis = np.zeros([n, k + 1])

# 1. Pick the first two samples as the initial cluster centers.
center1 = np.array([x[0], y[0]])
center2 = np.array([x[1], y[1]])

# Cap the number of iterations at 100.
iter_ = 100

while iter_ > 0:
    # The original never decremented iter_, so the 100-iteration cap was
    # ineffective and only the convergence break could end the loop.
    iter_ -= 1

    # 2. Distance from every sample to both centers.
    for i in range(n):
        dis[i, 0] = np.sqrt((x[i] - center1[0]) ** 2 + (y[i] - center1[1]) ** 2)
        dis[i, 1] = np.sqrt((x[i] - center2[0]) ** 2 + (y[i] - center2[1]) ** 2)
        # 3. Assign the sample to the nearer center.
        dis[i, 2] = np.argmin(dis[i, :2])

    # 4. Recompute each center as the mean of its assigned samples.
    index1 = dis[:, 2] == 0
    index2 = dis[:, 2] == 1
    center1_new = np.array([x[index1].mean(), y[index1].mean()])
    center2_new = np.array([x[index2].mean(), y[index2].mean()])

    # 5. Stop once the centers no longer move.
    if all((center1 == center1_new) & (center2 == center2_new)):
        break
    center1 = center1_new
    center2 = center2_new

# 6. Print the final distance/assignment table for inspection.
print(dis)
[[ 34.64823228 105.3589104    0.        ]
 [ 33.23401872 103.94469683   0.        ]
 [ 31.81980515 102.53048327   0.        ]
 [ 30.40559159 101.11626971   0.        ]
 [ 28.99137803  99.70205615   0.        ]
 [ 27.57716447  98.28784258   0.        ]
 [ 26.1629509   96.87362902   0.        ]
 [ 24.74873734  95.45941546   0.        ]
 [ 23.33452378  94.0452019    0.        ]
 [ 21.92031022  92.63098834   0.        ]
 [ 20.50609665  91.21677477   0.        ]
 [ 19.09188309  89.80256121   0.        ]
 [ 17.67766953  88.38834765   0.        ]
 [ 16.26345597  86.97413409   0.        ]
 [ 14.8492424   85.55992052   0.        ]
 [ 13.43502884  84.14570696   0.        ]
 [ 12.02081528  82.7314934    0.        ]
 [ 10.60660172  81.31727984   0.        ]
 [  9.19238816  79.90306627   0.        ]
 [  7.77817459  78.48885271   0.        ]
 [  6.36396103  77.07463915   0.        ]
 [  4.94974747  75.66042559   0.        ]
 [  3.53553391  74.24621202   0.        ]
 [  2.12132034  72.83199846   0.        ]
 [  0.70710678  71.4177849    0.        ]
 [  0.70710678  70.00357134   0.        ]
 [  2.12132034  68.58935778   0.        ]
 [  3.53553391  67.17514421   0.        ]
 [  4.94974747  65.76093065   0.        ]
 [  6.36396103  64.34671709   0.        ]
 [  7.77817459  62.93250353   0.        ]
 [  9.19238816  61.51828996   0.        ]
 [ 10.60660172  60.1040764    0.        ]
 [ 12.02081528  58.68986284   0.        ]
 [ 13.43502884  57.27564928   0.        ]
 [ 14.8492424   55.86143571   0.        ]
 [ 16.26345597  54.44722215   0.        ]
 [ 17.67766953  53.03300859   0.        ]
 [ 19.09188309  51.61879503   0.        ]
 [ 20.50609665  50.20458146   0.        ]
 [ 21.92031022  48.7903679    0.        ]
 [ 23.33452378  47.37615434   0.        ]
 [ 24.74873734  45.96194078   0.        ]
 [ 26.1629509   44.54772721   0.        ]
 [ 27.57716447  43.13351365   0.        ]
 [ 28.99137803  41.71930009   0.        ]
 [ 30.40559159  40.30508653   0.        ]
 [ 31.81980515  38.89087297   0.        ]
 [ 33.23401872  37.4766594    0.        ]
 [ 34.64823228  36.06244584   0.        ]
 [ 36.06244584  34.64823228   1.        ]
 [ 37.4766594   33.23401872   1.        ]
 [ 38.89087297  31.81980515   1.        ]
 [ 40.30508653  30.40559159   1.        ]
 [ 41.71930009  28.99137803   1.        ]
 [ 43.13351365  27.57716447   1.        ]
 [ 44.54772721  26.1629509    1.        ]
 [ 45.96194078  24.74873734   1.        ]
 [ 47.37615434  23.33452378   1.        ]
 [ 48.7903679   21.92031022   1.        ]
 [ 50.20458146  20.50609665   1.        ]
 [ 51.61879503  19.09188309   1.        ]
 [ 53.03300859  17.67766953   1.        ]
 [ 54.44722215  16.26345597   1.        ]
 [ 55.86143571  14.8492424    1.        ]
 [ 57.27564928  13.43502884   1.        ]
 [ 58.68986284  12.02081528   1.        ]
 [ 60.1040764   10.60660172   1.        ]
 [ 61.51828996   9.19238816   1.        ]
 [ 62.93250353   7.77817459   1.        ]
 [ 64.34671709   6.36396103   1.        ]
 [ 65.76093065   4.94974747   1.        ]
 [ 67.17514421   3.53553391   1.        ]
 [ 68.58935778   2.12132034   1.        ]
 [ 70.00357134   0.70710678   1.        ]
 [ 71.4177849    0.70710678   1.        ]
 [ 72.83199846   2.12132034   1.        ]
 [ 74.24621202   3.53553391   1.        ]
 [ 75.66042559   4.94974747   1.        ]
 [ 77.07463915   6.36396103   1.        ]
 [ 78.48885271   7.77817459   1.        ]
 [ 79.90306627   9.19238816   1.        ]
 [ 81.31727984  10.60660172   1.        ]
 [ 82.7314934   12.02081528   1.        ]
 [ 84.14570696  13.43502884   1.        ]
 [ 85.55992052  14.8492424    1.        ]
 [ 86.97413409  16.26345597   1.        ]
 [ 88.38834765  17.67766953   1.        ]
 [ 89.80256121  19.09188309   1.        ]
 [ 91.21677477  20.50609665   1.        ]
 [ 92.63098834  21.92031022   1.        ]
 [ 94.0452019   23.33452378   1.        ]
 [ 95.45941546  24.74873734   1.        ]
 [ 96.87362902  26.1629509    1.        ]
 [ 98.28784258  27.57716447   1.        ]
 [ 99.70205615  28.99137803   1.        ]
 [101.11626971  30.40559159   1.        ]
 [102.53048327  31.81980515   1.        ]
 [103.94469683  33.23401872   1.        ]
 [105.3589104   34.64823228   1.        ]]

具体案例

实现足球队的聚类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from sklearn.cluster import KMeans
from sklearn import preprocessing
import pandas as pd
import numpy as np

# Load the input data (GBK-encoded CSV of national football teams).
data = pd.read_csv('data.csv',encoding='gbk')
# Cluster on three ranking features: 2019 FIFA rank, 2018 World Cup, 2015 Asian Cup.
train_x = data[['2019年国际排名','2018世界杯','2015亚洲杯']]
df = pd.DataFrame(train_x)
kmeans = KMeans(n_clusters=3)

min_max_scaler = preprocessing.MinMaxScaler() # min-max normalisation so no feature dominates the distance
train_x = min_max_scaler.fit_transform(train_x)
kmeans.fit(train_x)

# Assign each team a cluster id and append it to the original table.
predict_y = kmeans.predict(train_x)
result = pd.concat((data,pd.DataFrame(predict_y)),axis=1)
result.rename({0:u'聚类'},axis=1,inplace=True)
print(result)

结果:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
        国家  2019年国际排名  2018世界杯  2015亚洲杯  聚类
0 中国 73 40 7 0
1 日本 60 15 5 2
2 韩国 61 19 2 2
3 伊朗 34 18 6 2
4 沙特 67 26 10 2
5 伊拉克 91 40 4 0
6 卡塔尔 101 40 13 1
7 阿联酋 81 40 6 0
8 乌兹别克斯坦 88 40 8 0
9 泰国 122 40 17 1
10 越南 102 50 17 1
11 阿曼 87 50 12 1
12 巴林 116 50 11 1
13 朝鲜 110 50 14 1
14 印尼 164 50 17 1
15 澳洲 40 30 1 2
16 叙利亚 76 40 17 1
17 约旦 118 50 9 0
18 科威特 160 50 15 1
19 巴勒斯坦 96 50 16 1
顺便评个分吧!👇

KNN算法分析与实现

Posted on 2021-07-23 | In 机器学习 , 算法

机器学习算法KNN

k-近邻算法:

  • 最容易理解的算法
  • 最容易实现的算法

几个需要考虑的问题:

  1. 把一个物体表示成向量
  2. 标记好每个物体的标签
  3. 计算两个物体之间的距离/相似度
  4. 选择合适的k

选择合适的K:

使用交叉验证技术(把训练数据进一步分为训练数据和验证数据集,选择在验证数据里最好的超参数组合)

image-20210723232529264

核心手动实现

1
2
3
4
import numpy as np
from sklearn import datasets #bundled example datasets
from sklearn.model_selection import train_test_split #split data into training and test sets
from collections import Counter #counter, used as the voting helper
1
2
3
#Load the data
iris = datasets.load_iris()
iris #the 'target' field holds the integer class labels
{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5.5, 4.2, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5. , 3.2, 1.2, 0.2],
        [5.5, 3.5, 1.3, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [4.4, 3. , 1.3, 0.2],
        [5.1, 3.4, 1.5, 0.2],
        [5. , 3.5, 1.3, 0.3],
        [4.5, 2.3, 1.3, 0.3],
        [4.4, 3.2, 1.3, 0.2],
        [5. , 3.5, 1.6, 0.6],
        [5.1, 3.8, 1.9, 0.4],
        [4.8, 3. , 1.4, 0.3],
        [5.1, 3.8, 1.6, 0.2],
        [4.6, 3.2, 1.4, 0.2],
        [5.3, 3.7, 1.5, 0.2],
        [5. , 3.3, 1.4, 0.2],
        [7. , 3.2, 4.7, 1.4],
        [6.4, 3.2, 4.5, 1.5],
        [6.9, 3.1, 4.9, 1.5],
        [5.5, 2.3, 4. , 1.3],
        [6.5, 2.8, 4.6, 1.5],
        [5.7, 2.8, 4.5, 1.3],
        [6.3, 3.3, 4.7, 1.6],
        [4.9, 2.4, 3.3, 1. ],
        [6.6, 2.9, 4.6, 1.3],
        [5.2, 2.7, 3.9, 1.4],
        [5. , 2. , 3.5, 1. ],
        [5.9, 3. , 4.2, 1.5],
        [6. , 2.2, 4. , 1. ],
        [6.1, 2.9, 4.7, 1.4],
        [5.6, 2.9, 3.6, 1.3],
        [6.7, 3.1, 4.4, 1.4],
        [5.6, 3. , 4.5, 1.5],
        [5.8, 2.7, 4.1, 1. ],
        [6.2, 2.2, 4.5, 1.5],
        [5.6, 2.5, 3.9, 1.1],
        [5.9, 3.2, 4.8, 1.8],
        [6.1, 2.8, 4. , 1.3],
        [6.3, 2.5, 4.9, 1.5],
        [6.1, 2.8, 4.7, 1.2],
        [6.4, 2.9, 4.3, 1.3],
        [6.6, 3. , 4.4, 1.4],
        [6.8, 2.8, 4.8, 1.4],
        [6.7, 3. , 5. , 1.7],
        [6. , 2.9, 4.5, 1.5],
        [5.7, 2.6, 3.5, 1. ],
        [5.5, 2.4, 3.8, 1.1],
        [5.5, 2.4, 3.7, 1. ],
        [5.8, 2.7, 3.9, 1.2],
        [6. , 2.7, 5.1, 1.6],
        [5.4, 3. , 4.5, 1.5],
        [6. , 3.4, 4.5, 1.6],
        [6.7, 3.1, 4.7, 1.5],
        [6.3, 2.3, 4.4, 1.3],
        [5.6, 3. , 4.1, 1.3],
        [5.5, 2.5, 4. , 1.3],
        [5.5, 2.6, 4.4, 1.2],
        [6.1, 3. , 4.6, 1.4],
        [5.8, 2.6, 4. , 1.2],
        [5. , 2.3, 3.3, 1. ],
        [5.6, 2.7, 4.2, 1.3],
        [5.7, 3. , 4.2, 1.2],
        [5.7, 2.9, 4.2, 1.3],
        [6.2, 2.9, 4.3, 1.3],
        [5.1, 2.5, 3. , 1.1],
        [5.7, 2.8, 4.1, 1.3],
        [6.3, 3.3, 6. , 2.5],
        [5.8, 2.7, 5.1, 1.9],
        [7.1, 3. , 5.9, 2.1],
        [6.3, 2.9, 5.6, 1.8],
        [6.5, 3. , 5.8, 2.2],
        [7.6, 3. , 6.6, 2.1],
        [4.9, 2.5, 4.5, 1.7],
        [7.3, 2.9, 6.3, 1.8],
        [6.7, 2.5, 5.8, 1.8],
        [7.2, 3.6, 6.1, 2.5],
        [6.5, 3.2, 5.1, 2. ],
        [6.4, 2.7, 5.3, 1.9],
        [6.8, 3. , 5.5, 2.1],
        [5.7, 2.5, 5. , 2. ],
        [5.8, 2.8, 5.1, 2.4],
        [6.4, 3.2, 5.3, 2.3],
        [6.5, 3. , 5.5, 1.8],
        [7.7, 3.8, 6.7, 2.2],
        [7.7, 2.6, 6.9, 2.3],
        [6. , 2.2, 5. , 1.5],
        [6.9, 3.2, 5.7, 2.3],
        [5.6, 2.8, 4.9, 2. ],
        [7.7, 2.8, 6.7, 2. ],
        [6.3, 2.7, 4.9, 1.8],
        [6.7, 3.3, 5.7, 2.1],
        [7.2, 3.2, 6. , 1.8],
        [6.2, 2.8, 4.8, 1.8],
        [6.1, 3. , 4.9, 1.8],
        [6.4, 2.8, 5.6, 2.1],
        [7.2, 3. , 5.8, 1.6],
        [7.4, 2.8, 6.1, 1.9],
        [7.9, 3.8, 6.4, 2. ],
        [6.4, 2.8, 5.6, 2.2],
        [6.3, 2.8, 5.1, 1.5],
        [6.1, 2.6, 5.6, 1.4],
        [7.7, 3. , 6.1, 2.3],
        [6.3, 3.4, 5.6, 2.4],
        [6.4, 3.1, 5.5, 1.8],
        [6. , 3. , 4.8, 1.8],
        [6.9, 3.1, 5.4, 2.1],
        [6.7, 3.1, 5.6, 2.4],
        [6.9, 3.1, 5.1, 2.3],
        [5.8, 2.7, 5.1, 1.9],
        [6.8, 3.2, 5.9, 2.3],
        [6.7, 3.3, 5.7, 2.5],
        [6.7, 3. , 5.2, 2.3],
        [6.3, 2.5, 5. , 1.9],
        [6.5, 3. , 5.2, 2. ],
        [6.2, 3.4, 5.4, 2.3],
        [5.9, 3. , 5.1, 1.8]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),
 'DESCR': 'Iris Plants Database\n====================\n\nNotes\n-----\nData Set Characteristics:\n    :Number of Instances: 150 (50 in each of three classes)\n    :Number of Attributes: 4 numeric, predictive attributes and the class\n    :Attribute Information:\n        - sepal length in cm\n        - sepal width in cm\n        - petal length in cm\n        - petal width in cm\n        - class:\n                - Iris-Setosa\n                - Iris-Versicolour\n                - Iris-Virginica\n    :Summary Statistics:\n\n    ============== ==== ==== ======= ===== ====================\n                    Min  Max   Mean    SD   Class Correlation\n    ============== ==== ==== ======= ===== ====================\n    sepal length:   4.3  7.9   5.84   0.83    0.7826\n    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)\n    ============== ==== ==== ======= ===== ====================\n\n    :Missing Attribute Values: None\n    :Class Distribution: 33.3% for each of 3 classes.\n    :Creator: R.A. Fisher\n    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n    :Date: July, 1988\n\nThis is a copy of UCI ML iris datasets.\nhttp://archive.ics.uci.edu/ml/datasets/Iris\n\nThe famous Iris database, first used by Sir R.A Fisher\n\nThis is perhaps the best known database to be found in the\npattern recognition literature.  Fisher\'s paper is a classic in the field and\nis referenced frequently to this day.  (See Duda & Hart, for example.)  The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant.  One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\nReferences\n----------\n   - Fisher,R.A. 
"The use of multiple measurements in taxonomic problems"\n     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n     Mathematical Statistics" (John Wiley, NY, 1950).\n   - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.\n   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n     Structure and Classification Rule for Recognition in Partially Exposed\n     Environments".  IEEE Transactions on Pattern Analysis and Machine\n     Intelligence, Vol. PAMI-2, No. 1, 67-71.\n   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions\n     on Information Theory, May 1972, 431-433.\n   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II\n     conceptual clustering system finds 3 classes in the data.\n   - Many, many more ...\n',
 'feature_names': ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)']}
1
iris.data.shape #查看数据量,150个样本数据,4个特征
(150, 4)
1
2
3
4
5
# 4 features:
# sepal length, sepal width, petal length, petal width
X = iris.data
# classes: setosa, versicolor, virginica
y = iris.target
1
2
3
4
# Split into training and test data
# train features, test features, train labels, test labels
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=2003,test_size=0.2) #fixed random seed; 20% held out for testing
#returns 4 variables: features and labels for each split
1
2
X_train[0]
X_train[1]
array([5.1, 3.8, 1.5, 0.3])
1
2
3
4
5
6
7
8
9
10
# Euclidean distance in feature space.
def euc_dis(instance1,instance2):
    """Return the Euclidean (L2) distance between two sample points.

    instance1: first sample point (1-D numpy array)
    instance2: second sample point (1-D numpy array)
    return: the distance as a float
    """
    squared_diff = (instance1 - instance2) ** 2
    return np.sqrt(squared_diff.sum())
1
2
3
4
5
6
from collections import Counter #vote counter

a1 = "abcdad"

print(Counter(a1))
print(Counter(a1).most_common()[0][0]) #take the key with the highest count (ties broken by first occurrence)
Counter({'a': 2, 'd': 2, 'b': 1, 'c': 1})
a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Build the KNN "model" (a lazy learner: no training step, prediction scans the data).
def knn_classify(X,y,testInstance,k=5):
    """Predict the label of ``testInstance`` with the k-nearest-neighbour rule.

    X: training features, shape (n_samples, n_features)
    y: training labels, shape (n_samples,)
    testInstance: one test sample, shape (n_features,)
    k: number of neighbours that vote (default 5)
    return: the majority label among the k closest training samples
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Euclidean distance from the test point to every training sample,
    # computed in one vectorized pass instead of a per-row Python loop.
    distances = np.sqrt(((X - testInstance) ** 2).sum(axis=1))
    # Indices of the k nearest training samples (k, not a hard-coded 3 —
    # the original comment claimed "three points" while the default is 5).
    k_neighbors = np.argsort(distances)[:k]
    # Majority vote over the neighbours' labels.
    count = Counter(y[k_neighbors])
    return count.most_common()[0][0]
1
2
3
#Test / predict: classify every test sample with k=5
pred = [knn_classify(X_train,y_train,data,5) for data in X_test]
print(pred)
[1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 2, 0, 2, 2, 2, 0, 1, 2, 1, 1, 0, 1, 2, 2, 1, 2]
1
2
3
# Model validation: count predictions that match the true labels
count = np.count_nonzero((pred==y_test)==True)
count #number of correct predictions (27 of the 30 test samples in this run)
27
1
print('模型预测正确率:%.3f'%(count/len(X_test)))
模型预测正确率:0.900

自带模型实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
from sklearn import datasets    
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier #KNN classifier
import numpy as np


iris = datasets.load_iris()

X = iris.data
y = iris.target
print(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2003)
# Instantiate the KNN classifier
clf = KNeighborsClassifier(n_neighbors=3)
# Train
clf.fit(X_train, y_train)
# Predict on the test split and count the correct answers
correct = np.count_nonzero((clf.predict(X_test)==y_test)==True) #predict through the fitted model
print("Accuracy is: %.3f" %(correct/len(X_test)))
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.9 1.5]
 [5.5 2.3 4.  1.3]
 [6.5 2.8 4.6 1.5]
 [5.7 2.8 4.5 1.3]
 [6.3 3.3 4.7 1.6]
 [4.9 2.4 3.3 1. ]
 [6.6 2.9 4.6 1.3]
 [5.2 2.7 3.9 1.4]
 [5.  2.  3.5 1. ]
 [5.9 3.  4.2 1.5]
 [6.  2.2 4.  1. ]
 [6.1 2.9 4.7 1.4]
 [5.6 2.9 3.6 1.3]
 [6.7 3.1 4.4 1.4]
 [5.6 3.  4.5 1.5]
 [5.8 2.7 4.1 1. ]
 [6.2 2.2 4.5 1.5]
 [5.6 2.5 3.9 1.1]
 [5.9 3.2 4.8 1.8]
 [6.1 2.8 4.  1.3]
 [6.3 2.5 4.9 1.5]
 [6.1 2.8 4.7 1.2]
 [6.4 2.9 4.3 1.3]
 [6.6 3.  4.4 1.4]
 [6.8 2.8 4.8 1.4]
 [6.7 3.  5.  1.7]
 [6.  2.9 4.5 1.5]
 [5.7 2.6 3.5 1. ]
 [5.5 2.4 3.8 1.1]
 [5.5 2.4 3.7 1. ]
 [5.8 2.7 3.9 1.2]
 [6.  2.7 5.1 1.6]
 [5.4 3.  4.5 1.5]
 [6.  3.4 4.5 1.6]
 [6.7 3.1 4.7 1.5]
 [6.3 2.3 4.4 1.3]
 [5.6 3.  4.1 1.3]
 [5.5 2.5 4.  1.3]
 [5.5 2.6 4.4 1.2]
 [6.1 3.  4.6 1.4]
 [5.8 2.6 4.  1.2]
 [5.  2.3 3.3 1. ]
 [5.6 2.7 4.2 1.3]
 [5.7 3.  4.2 1.2]
 [5.7 2.9 4.2 1.3]
 [6.2 2.9 4.3 1.3]
 [5.1 2.5 3.  1.1]
 [5.7 2.8 4.1 1.3]
 [6.3 3.3 6.  2.5]
 [5.8 2.7 5.1 1.9]
 [7.1 3.  5.9 2.1]
 [6.3 2.9 5.6 1.8]
 [6.5 3.  5.8 2.2]
 [7.6 3.  6.6 2.1]
 [4.9 2.5 4.5 1.7]
 [7.3 2.9 6.3 1.8]
 [6.7 2.5 5.8 1.8]
 [7.2 3.6 6.1 2.5]
 [6.5 3.2 5.1 2. ]
 [6.4 2.7 5.3 1.9]
 [6.8 3.  5.5 2.1]
 [5.7 2.5 5.  2. ]
 [5.8 2.8 5.1 2.4]
 [6.4 3.2 5.3 2.3]
 [6.5 3.  5.5 1.8]
 [7.7 3.8 6.7 2.2]
 [7.7 2.6 6.9 2.3]
 [6.  2.2 5.  1.5]
 [6.9 3.2 5.7 2.3]
 [5.6 2.8 4.9 2. ]
 [7.7 2.8 6.7 2. ]
 [6.3 2.7 4.9 1.8]
 [6.7 3.3 5.7 2.1]
 [7.2 3.2 6.  1.8]
 [6.2 2.8 4.8 1.8]
 [6.1 3.  4.9 1.8]
 [6.4 2.8 5.6 2.1]
 [7.2 3.  5.8 1.6]
 [7.4 2.8 6.1 1.9]
 [7.9 3.8 6.4 2. ]
 [6.4 2.8 5.6 2.2]
 [6.3 2.8 5.1 1.5]
 [6.1 2.6 5.6 1.4]
 [7.7 3.  6.1 2.3]
 [6.3 3.4 5.6 2.4]
 [6.4 3.1 5.5 1.8]
 [6.  3.  4.8 1.8]
 [6.9 3.1 5.4 2.1]
 [6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
Accuracy is: 0.921

怎么选择合适的K

1
2
3
4
5
6
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei'] # default font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False # keep the minus sign from rendering as a box in saved figures
1
2
3
4
5
6
iris = load_iris()
# Features
data = iris.data[:,:2] #keep only the first two features so the data can be plotted in 2-D
print(data)
# Labels
target =iris.target
[[5.1 3.5]
 [4.9 3. ]
 [4.7 3.2]
 [4.6 3.1]
 [5.  3.6]
 [5.4 3.9]
 [4.6 3.4]
 [5.  3.4]
 [4.4 2.9]
 [4.9 3.1]
 [5.4 3.7]
 [4.8 3.4]
 [4.8 3. ]
 [4.3 3. ]
 [5.8 4. ]
 [5.7 4.4]
 [5.4 3.9]
 [5.1 3.5]
 [5.7 3.8]
 [5.1 3.8]
 [5.4 3.4]
 [5.1 3.7]
 [4.6 3.6]
 [5.1 3.3]
 [4.8 3.4]
 [5.  3. ]
 [5.  3.4]
 [5.2 3.5]
 [5.2 3.4]
 [4.7 3.2]
 [4.8 3.1]
 [5.4 3.4]
 [5.2 4.1]
 [5.5 4.2]
 [4.9 3.1]
 [5.  3.2]
 [5.5 3.5]
 [4.9 3.1]
 [4.4 3. ]
 [5.1 3.4]
 [5.  3.5]
 [4.5 2.3]
 [4.4 3.2]
 [5.  3.5]
 [5.1 3.8]
 [4.8 3. ]
 [5.1 3.8]
 [4.6 3.2]
 [5.3 3.7]
 [5.  3.3]
 [7.  3.2]
 [6.4 3.2]
 [6.9 3.1]
 [5.5 2.3]
 [6.5 2.8]
 [5.7 2.8]
 [6.3 3.3]
 [4.9 2.4]
 [6.6 2.9]
 [5.2 2.7]
 [5.  2. ]
 [5.9 3. ]
 [6.  2.2]
 [6.1 2.9]
 [5.6 2.9]
 [6.7 3.1]
 [5.6 3. ]
 [5.8 2.7]
 [6.2 2.2]
 [5.6 2.5]
 [5.9 3.2]
 [6.1 2.8]
 [6.3 2.5]
 [6.1 2.8]
 [6.4 2.9]
 [6.6 3. ]
 [6.8 2.8]
 [6.7 3. ]
 [6.  2.9]
 [5.7 2.6]
 [5.5 2.4]
 [5.5 2.4]
 [5.8 2.7]
 [6.  2.7]
 [5.4 3. ]
 [6.  3.4]
 [6.7 3.1]
 [6.3 2.3]
 [5.6 3. ]
 [5.5 2.5]
 [5.5 2.6]
 [6.1 3. ]
 [5.8 2.6]
 [5.  2.3]
 [5.6 2.7]
 [5.7 3. ]
 [5.7 2.9]
 [6.2 2.9]
 [5.1 2.5]
 [5.7 2.8]
 [6.3 3.3]
 [5.8 2.7]
 [7.1 3. ]
 [6.3 2.9]
 [6.5 3. ]
 [7.6 3. ]
 [4.9 2.5]
 [7.3 2.9]
 [6.7 2.5]
 [7.2 3.6]
 [6.5 3.2]
 [6.4 2.7]
 [6.8 3. ]
 [5.7 2.5]
 [5.8 2.8]
 [6.4 3.2]
 [6.5 3. ]
 [7.7 3.8]
 [7.7 2.6]
 [6.  2.2]
 [6.9 3.2]
 [5.6 2.8]
 [7.7 2.8]
 [6.3 2.7]
 [6.7 3.3]
 [7.2 3.2]
 [6.2 2.8]
 [6.1 3. ]
 [6.4 2.8]
 [7.2 3. ]
 [7.4 2.8]
 [7.9 3.8]
 [6.4 2.8]
 [6.3 2.8]
 [6.1 2.6]
 [7.7 3. ]
 [6.3 3.4]
 [6.4 3.1]
 [6.  3. ]
 [6.9 3.1]
 [6.7 3.1]
 [6.9 3.1]
 [5.8 2.7]
 [6.8 3.2]
 [6.7 3.3]
 [6.7 3. ]
 [6.3 2.5]
 [6.5 3. ]
 [6.2 3.4]
 [5.9 3. ]]
1
2
3
4
5
6
7
8
9
10
11
# Scatter-plot the two selected features, one colour/marker per class.
label = np.array(target)
index_0 = np.where(label==0) #rows belonging to class 0
plt.scatter(data[index_0,0],data[index_0,1],marker='x',color='b',label='0',s=15) #blue crosses
index_1 = np.where(label==1)
plt.scatter(data[index_1,0],data[index_1,1],marker='o',color='r',label='1',s=15) #red circles
index_2 = np.where(label==2)
plt.scatter(data[index_2,0],data[index_2,1],marker='s',color='g',label='2',s=15) #green squares
plt.xlabel('X1')
# BUG FIX: this was a second plt.xlabel('X2') call, which overwrote the
# x-axis label and left the y axis unlabelled; it should label the y axis.
plt.ylabel('X2')
plt.legend(loc='upper left')
plt.show()

png

1
2
3
4
5
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA #降维
from mpl_toolkits.mplot3d import Axes3D #绘制3D图
1
2
3
# Reload the full 4-feature iris data for the PCA demo.
iris = datasets.load_iris()
data = iris.data
target = iris.target
1
2
3
4
# Principal component analysis: compress the 4 features down to 3
pca_3 = PCA(n_components=3) #keep 3 components
data_pca_3 = pca_3.fit_transform(data)
data_pca_3
array([[-2.68420713,  0.32660731, -0.02151184],
       [-2.71539062, -0.16955685, -0.20352143],
       [-2.88981954, -0.13734561,  0.02470924],
       [-2.7464372 , -0.31112432,  0.03767198],
       [-2.72859298,  0.33392456,  0.0962297 ],
       [-2.27989736,  0.74778271,  0.17432562],
       [-2.82089068, -0.08210451,  0.26425109],
       [-2.62648199,  0.17040535, -0.01580151],
       [-2.88795857, -0.57079803,  0.02733541],
       [-2.67384469, -0.1066917 , -0.1915333 ],
       [-2.50652679,  0.65193501, -0.069275  ],
       [-2.61314272,  0.02152063,  0.10765035],
       [-2.78743398, -0.22774019, -0.20032779],
       [-3.22520045, -0.50327991,  0.06841363],
       [-2.64354322,  1.1861949 , -0.1445057 ],
       [-2.38386932,  1.34475434,  0.28373066],
       [-2.6225262 ,  0.81808967,  0.14531599],
       [-2.64832273,  0.31913667,  0.03339425],
       [-2.19907796,  0.87924409, -0.11452146],
       [-2.58734619,  0.52047364,  0.21957209],
       [-2.3105317 ,  0.39786782, -0.23369561],
       [-2.54323491,  0.44003175,  0.21483637],
       [-3.21585769,  0.14161557,  0.29961898],
       [-2.30312854,  0.10552268,  0.04568004],
       [-2.35617109, -0.03120959,  0.12940758],
       [-2.50791723, -0.13905634, -0.24711634],
       [-2.469056  ,  0.13788731,  0.10126308],
       [-2.56239095,  0.37468456, -0.07235916],
       [-2.63982127,  0.31929007, -0.13925337],
       [-2.63284791, -0.19007583,  0.04646646],
       [-2.58846205, -0.19739308, -0.07127507],
       [-2.41007734,  0.41808001, -0.13838824],
       [-2.64763667,  0.81998263,  0.2305856 ],
       [-2.59715948,  1.10002193,  0.16358191],
       [-2.67384469, -0.1066917 , -0.1915333 ],
       [-2.86699985,  0.0771931 , -0.15684235],
       [-2.62522846,  0.60680001, -0.26116316],
       [-2.67384469, -0.1066917 , -0.1915333 ],
       [-2.98184266, -0.48025005,  0.07972481],
       [-2.59032303,  0.23605934, -0.07390124],
       [-2.77013891,  0.27105942,  0.08424157],
       [-2.85221108, -0.93286537, -0.34096149],
       [-2.99829644, -0.33430757,  0.19900842],
       [-2.4055141 ,  0.19591726,  0.27071707],
       [-2.20883295,  0.44269603,  0.30348781],
       [-2.71566519, -0.24268148, -0.09051561],
       [-2.53757337,  0.51036755,  0.1719184 ],
       [-2.8403213 , -0.22057634,  0.09006138],
       [-2.54268576,  0.58628103, -0.01117527],
       [-2.70391231,  0.11501085, -0.08269573],
       [ 1.28479459,  0.68543919, -0.40612955],
       [ 0.93241075,  0.31919809, -0.01712991],
       [ 1.46406132,  0.50418983, -0.33826073],
       [ 0.18096721, -0.82560394, -0.17708286],
       [ 1.08713449,  0.07539039, -0.30654446],
       [ 0.64043675, -0.41732348,  0.04118877],
       [ 1.09522371,  0.28389121,  0.17002253],
       [-0.75146714, -1.00110751,  0.01567219],
       [ 1.04329778,  0.22895691, -0.41481457],
       [-0.01019007, -0.72057487,  0.28343725],
       [-0.5110862 , -1.26249195, -0.26648995],
       [ 0.51109806, -0.10228411,  0.13232789],
       [ 0.26233576, -0.5478933 , -0.69194158],
       [ 0.98404455, -0.12436042, -0.06215743],
       [-0.174864  , -0.25181557,  0.09365864],
       [ 0.92757294,  0.46823621, -0.3132294 ],
       [ 0.65959279, -0.35197629,  0.3283843 ],
       [ 0.23454059, -0.33192183, -0.27028067],
       [ 0.94236171, -0.54182226, -0.49734854],
       [ 0.0432464 , -0.58148945, -0.23296356],
       [ 1.11624072, -0.08421401,  0.45984423],
       [ 0.35678657, -0.06682383, -0.22747218],
       [ 1.29646885, -0.32756152, -0.34751321],
       [ 0.92050265, -0.18239036, -0.23161142],
       [ 0.71400821,  0.15037915, -0.32037233],
       [ 0.89964086,  0.32961098, -0.31477148],
       [ 1.33104142,  0.24466952, -0.52124492],
       [ 1.55739627,  0.26739258, -0.16463849],
       [ 0.81245555, -0.16233157,  0.03634358],
       [-0.30733476, -0.36508661, -0.3153372 ],
       [-0.07034289, -0.70253793, -0.24175804],
       [-0.19188449, -0.67749054, -0.30391654],
       [ 0.13499495, -0.31170964, -0.1749733 ],
       [ 1.37873698, -0.42120514,  0.0154805 ],
       [ 0.58727485, -0.48328427,  0.44458375],
       [ 0.8072055 ,  0.19505396,  0.38945871],
       [ 1.22042897,  0.40803534, -0.23656609],
       [ 0.81286779, -0.370679  , -0.61287105],
       [ 0.24519516, -0.26672804,  0.18956248],
       [ 0.16451343, -0.67966147, -0.05779924],
       [ 0.46303099, -0.66952655, -0.02405389],
       [ 0.89016045, -0.03381244, -0.00976803],
       [ 0.22887905, -0.40225762, -0.22736271],
       [-0.70708128, -1.00842476, -0.10206934],
       [ 0.35553304, -0.50321849,  0.01788947],
       [ 0.33112695, -0.21118014,  0.08380907],
       [ 0.37523823, -0.29162202,  0.07907336],
       [ 0.64169028,  0.01907118, -0.20417288],
       [-0.90846333, -0.75156873, -0.00773658],
       [ 0.29780791, -0.34701652,  0.01217914],
       [ 2.53172698, -0.01184224,  0.75845865],
       [ 1.41407223, -0.57492506,  0.29639822],
       [ 2.61648461,  0.34193529, -0.11214137],
       [ 1.97081495, -0.18112569,  0.10653915],
       [ 2.34975798, -0.04188255,  0.28411068],
       [ 3.39687992,  0.54716805, -0.35187316],
       [ 0.51938325, -1.19135169,  0.54668553],
       [ 2.9320051 ,  0.35237701, -0.42369128],
       [ 2.31967279, -0.24554817, -0.34992218],
       [ 2.91813423,  0.78038063,  0.42173893],
       [ 1.66193495,  0.2420384 ,  0.24281526],
       [ 1.80234045, -0.21615461, -0.03769533],
       [ 2.16537886,  0.21528028,  0.03314818],
       [ 1.34459422, -0.77641543,  0.28286802],
       [ 1.5852673 , -0.53930705,  0.63057049],
       [ 1.90474358,  0.11881899,  0.48013808],
       [ 1.94924878,  0.04073026,  0.04272909],
       [ 3.48876538,  1.17154454,  0.12932008],
       [ 3.79468686,  0.25326557, -0.51697072],
       [ 1.29832982, -0.76101394, -0.34488705],
       [ 2.42816726,  0.37678197,  0.21864907],
       [ 1.19809737, -0.60557896,  0.51264077],
       [ 3.49926548,  0.45677347, -0.57691019],
       [ 1.38766825, -0.20403099, -0.06351132],
       [ 2.27585365,  0.33338653,  0.28467815],
       [ 2.61419383,  0.55836695, -0.20842335],
       [ 1.25762518, -0.179137  ,  0.04697781],
       [ 1.29066965, -0.11642525,  0.23161356],
       [ 2.12285398, -0.21085488,  0.15351589],
       [ 2.3875644 ,  0.46251925, -0.45202396],
       [ 2.84096093,  0.37274259, -0.50103154],
       [ 3.2323429 ,  1.37052404, -0.11844878],
       [ 2.15873837, -0.21832553,  0.20842198],
       [ 1.4431026 , -0.14380129, -0.15408297],
       [ 1.77964011, -0.50146479, -0.17581119],
       [ 3.07652162,  0.68576444, -0.33642274],
       [ 2.14498686,  0.13890661,  0.73418474],
       [ 1.90486293,  0.04804751,  0.16047063],
       [ 1.16885347, -0.1645025 ,  0.28246088],
       [ 2.10765373,  0.37148225,  0.02743786],
       [ 2.31430339,  0.18260885,  0.3228604 ],
       [ 1.92245088,  0.40927118,  0.11549282],
       [ 1.41407223, -0.57492506,  0.29639822],
       [ 2.56332271,  0.2759745 ,  0.29125361],
       [ 2.41939122,  0.30350394,  0.50430252],
       [ 1.94401705,  0.18741522,  0.17930287],
       [ 1.52566363, -0.37502085, -0.12063644],
       [ 1.76404594,  0.07851919,  0.13078405],
       [ 1.90162908,  0.11587675,  0.72287356],
       [ 1.38966613, -0.28288671,  0.36231783]])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Plot the three principal components in 3-D, one colour/marker per class.
colors = {0: 'r', 1: 'b', 2: 'k'}
markers = {0: 'x', 1: 'D', 2: 'o'}

fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=-110)  # 3-D axes with a fixed viewing angle

# Group the PCA-projected rows by their class label.
data_pca_gb = pd.DataFrame(data_pca_3).groupby(target)

for g in data_pca_gb.groups:
    grp = data_pca_gb.get_group(g)
    ax.scatter(
        grp[0],
        grp[1],
        grp[2],
        c=colors[g],
        marker=markers[g],
        cmap=plt.cm.Paired
    )
plt.show()

# The three classes look separable in 3-D, so classification is feasible.

png

验证K

1
2
3
4
5
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# train features, test features, train labels, test labels
X, X_test, y, y_test = train_test_split(data, target, test_size=0.2)
1
2
3
4
5
6
k_choices = [1,3,5,7,9,13,15,21,25,27,29,31]
X_folds = np.vsplit(X,4) #split the training FEATURES into 4 folds
y_folds = np.hsplit(y,4) #split the training LABELS into 4 folds; y is 1-D, so hsplit must be used
y_folds
# 4 folds are used because a single split gives a noisy accuracy (especially at k=1);
# validating each k on 4 different folds yields a more reliable estimate

[array([0, 0, 2, 0, 2, 1, 0, 0, 0, 1, 2, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 2,
        1, 1, 2, 1, 0, 2, 0, 2]),
 array([2, 0, 0, 1, 2, 1, 2, 0, 2, 2, 2, 0, 1, 0, 2, 1, 2, 0, 2, 1, 1, 0,
        0, 1, 0, 0, 2, 0, 0, 2]),
 array([0, 1, 2, 1, 1, 1, 0, 1, 2, 2, 2, 0, 0, 0, 2, 0, 2, 1, 1, 2, 1, 2,
        0, 1, 1, 0, 1, 0, 0, 0]),
 array([0, 2, 2, 2, 0, 0, 0, 1, 2, 2, 0, 2, 1, 1, 1, 2, 1, 1, 0, 0, 0, 2,
        2, 2, 2, 1, 1, 2, 0, 1])]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# 4-fold cross-validation over the candidate k values.
# (The original initialised this dict twice; once is enough.)
accuracy_of_k = {k: [] for k in k_choices}  # k -> list of per-fold accuracies
for i in range(4):
    # Fold i is the validation set; the other three folds form the training set.
    X_train = np.vstack(X_folds[:i]+X_folds[i+1:])
    X_val = X_folds[i]
    y_train = np.hstack(y_folds[:i]+y_folds[i+1:])
    y_val = y_folds[i]
    print(X_train.shape,X_val.shape,y_train.shape,y_val.shape)
    for k in k_choices:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train,y_train)
        y_val_pred = knn.predict(X_val)
        # Fraction of validation samples predicted correctly for this fold.
        accuracy = np.mean(y_val_pred==y_val)
        accuracy_of_k[k].append(accuracy)

# Report every fold's accuracy for each k, in ascending k order.
for k in sorted(k_choices):
    for accuracy in accuracy_of_k[k]:
        print('k=%d,accuracy=%f'%(k,accuracy))
(90, 4) (30, 4) (90,) (30,)
(90, 4) (30, 4) (90,) (30,)
(90, 4) (30, 4) (90,) (30,)
(90, 4) (30, 4) (90,) (30,)
k=1,accuracy=0.933333
k=1,accuracy=0.966667
k=1,accuracy=1.000000
k=1,accuracy=0.866667
k=3,accuracy=0.933333
k=3,accuracy=1.000000
k=3,accuracy=1.000000
k=3,accuracy=0.933333
k=5,accuracy=0.933333
k=5,accuracy=1.000000
k=5,accuracy=0.966667
k=5,accuracy=0.966667
k=7,accuracy=0.933333
k=7,accuracy=1.000000
k=7,accuracy=0.933333
k=7,accuracy=0.966667
k=9,accuracy=0.966667
k=9,accuracy=1.000000
k=9,accuracy=0.933333
k=9,accuracy=0.933333
k=13,accuracy=0.966667
k=13,accuracy=1.000000
k=13,accuracy=0.966667
k=13,accuracy=0.933333
k=15,accuracy=0.966667
k=15,accuracy=1.000000
k=15,accuracy=0.966667
k=15,accuracy=0.933333
k=21,accuracy=0.966667
k=21,accuracy=1.000000
k=21,accuracy=0.933333
k=21,accuracy=0.900000
k=25,accuracy=0.966667
k=25,accuracy=1.000000
k=25,accuracy=0.966667
k=25,accuracy=0.866667
k=27,accuracy=0.966667
k=27,accuracy=1.000000
k=27,accuracy=0.966667
k=27,accuracy=0.900000
k=29,accuracy=0.966667
k=29,accuracy=1.000000
k=29,accuracy=0.933333
k=29,accuracy=0.866667
k=31,accuracy=0.966667
k=31,accuracy=1.000000
k=31,accuracy=0.933333
k=31,accuracy=0.866667
1
2
3
4
5
6
7
8
9
# Scatter every fold's accuracy per k, then overlay mean ± std as an error bar.
for k in k_choices:
    plt.scatter([k] * len(accuracy_of_k[k]), accuracy_of_k[k])

sorted_items = sorted(accuracy_of_k.items())
accuracies_mean = np.array([np.mean(v) for _, v in sorted_items])
accuracies_std = np.array([np.std(v) for _, v in sorted_items])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('在k上进行交叉验证')
plt.xlabel('k')
plt.ylabel('交叉验证准确性')
plt.show()

png

1
2
3
4
5
6
7
# Retrain with the chosen k and evaluate on the held-out test set.
best_k = 10
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train,y_train)
y_test_pred = knn.predict(X_test)
num_correct = np.sum(y_test==y_test_pred) # number of correct test predictions
accuracy_test = np.mean(y_test==y_test_pred) # accuracy on the 30-sample test set
print('test accuracy is %d/%d = %f'%(num_correct,X_test.shape[0],accuracy_test))
test accuracy is 28/30 = 0.933333
顺便评个分吧!👇

Pyecharts可视化(二)

Posted on 2021-07-21 | In 数据可视化 , Pyecharts

数据分析数据可视化Pyecharts
点击我查看效果

柱形图

1
2
3
4
5
6
7
8
9
10
11
12
13
from pyecharts import options as opts
from pyecharts.charts import Bar

bar1 = (
# Chart type
Bar()
.add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
.add_yaxis("商家A", [5, 20, 36, 10, 75, 90])
.add_yaxis("商家B", [20, 30, 41, 15, 45, 100])
# Global options (title and subtitle)
.set_global_opts(title_opts=opts.TitleOpts(title="主标题", subtitle="副标题"))
)
bar1.render_notebook()
    <div id="483573e30f2f416da05811d91f667a50" style="width:900px; height:500px;"></div>

横向柱形图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from pyecharts import options as opts
from pyecharts.charts import Bar

def bar_reversal_axis(): # returns a Bar class instance
    """Build and return a horizontal bar chart (axes flipped)."""
    chart = (
        # Chart type
        Bar()
        .add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
        .add_yaxis("商家A", [5, 20, 36, 10, 75, 90])
        .add_yaxis("商家B", [20, 30, 41, 15, 45, 100])
        # Flip axes so the bars run horizontally
        .reversal_axis()
        # Series options: put value labels to the right of each bar
        .set_series_opts(label_opts=opts.LabelOpts(position='right'))
        # Global options (title and subtitle)
        .set_global_opts(title_opts=opts.TitleOpts(title="主标题", subtitle="副标题"))
    )
    return chart
1
2
barh = bar_reversal_axis()
barh.render_notebook()
    <div id="fdc5d30829da4fca9687d275cfc80101" style="width:900px; height:500px;"></div>

折线图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#The toolbox widget can toggle bars <-> lines; or simply use the Line chart class directly
from pyecharts import options as opts
from pyecharts.charts import Line

x = ['2018-{:0>2d}'.format(s) for s in range(1,13)]
y1 = [5,10,26,30,35,30,20,26,40,46,40,50]
y2 = [8,20,24,36,40,36,40,45,50,53,48,58]

bar1 = (
# Chart type
Line()
.add_xaxis(x)
.add_yaxis("基金A", y1)
.add_yaxis("基金B", y2)
# Global options (title and subtitle)
.set_global_opts(title_opts=opts.TitleOpts(title="主标题", subtitle="副标题"))
)
bar1.render_notebook()
    <div id="8b01a8f8d42249c0a0225af6e160f0dd" style="width:900px; height:500px;"></div>

散点图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from pyecharts.charts import Scatter
import numpy as np
import pandas as pd

df = pd.DataFrame()
df['weight'] = np.array([56,67,65,70,57,60,80,85,76,64],dtype='int32') #explicit integer dtype for the array
df['height'] = np.array([162,170,168,172,168,172,180,176,178,170],dtype='int32')
df['height_m'] = [150,160,164,170,160,158,169,173,171,179]

scatter = Scatter()
scatter.add_xaxis(df['weight']) #Scatter accepts a pandas Series directly; other chart types need plain lists
scatter.add_yaxis('男生',df['height'])
scatter.add_yaxis('女生',df['height_m'])
scatter.set_global_opts(title_opts=opts.TitleOpts(title="Scatter-基本示例"))
scatter.render_notebook()
    <div id="34892aa5965b4c6da5ccc50700698d7f" style="width:900px; height:500px;"></div>

散点图数据支持series类型

箱线图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from pyecharts import options as opts
from pyecharts.charts import Boxplot

v1 = [
[850, 740, 900, 1070, 930, 850, 950, 980, 980, 880]
+ [1000, 980, 930, 650, 760, 810, 1000, 1000, 960, 960],
[960, 940, 960, 940, 880, 800, 850, 880, 900]
+ [840, 830, 790, 810, 880, 880, 830, 800, 790, 760, 800],
]

v2 = [
[890, 810, 810, 820, 800, 770, 760, 740, 750, 760]
+ [910, 920, 890, 860, 880, 720, 840, 850, 850, 780],
[890, 840, 780, 810, 760, 810, 790, 810, 820, 850, 870]
+ [870, 810, 740, 810, 940, 950, 800, 810, 870],
]


c = Boxplot() #box-plot chart
c.add_xaxis(["expr1", "expr2"])
c.add_yaxis("A", c.prepare_data(v1)) #the solid line is the median; prepare_data is required because v1 is a plain list, not a DataFrame
c.add_yaxis(
"B", c.prepare_data(v2) # computes median, min, max, quartiles...
)
c.set_global_opts(title_opts=opts.TitleOpts(title="BoxPlot-基本示例"))
c.render_notebook()
#input data should follow the same nested-list format as above
    <div id="8d0a1536a9d24abe88f3c6977b6f4157" style="width:900px; height:500px;"></div>

饼图

1
2
3
4
5
6
7
8
9
10
11
12
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.faker import Faker

# Labels and values must be paired one-to-one; radius makes a donut chart.
c = Pie()
c.add(
    "",
    [list(pair) for pair in zip(Faker.choose(), Faker.values())],
    radius=["30%", "70%"],
)
c.set_global_opts(title_opts=opts.TitleOpts(title="Pie-基本示例"))
# Label formatter: category name plus its percentage share.
c.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
c.render_notebook()
    <div id="6041731010b84d54b150ab399ebc991a" style="width:900px; height:500px;"></div>
1
2
# zip pairs the two sequences element-wise; print each resulting tuple.
for letter, number in zip(['a','b','c'],[1,2,3]):
    print((letter, number))
('a', 1)
('b', 2)
('c', 3)

词云图

  • jieba中文分词
  • WordCloud词云库
1
2
3
4
5
6
7
8
9
10
from pyecharts.charts import WordCloud

words = ['python','jupyter','numpy','pandas','matplotlib','sklearn',
         'xgboost','lightGBM','simpy','keras','tensorflow',
         'hive','hadoop','spark']
counts = [100,90,65,95,50,60,70,70,20,70,80,80,60,60]

cloud = WordCloud()
# FIX: materialize the (word, count) pairs — WordCloud.add expects a
# sequence of pairs, and a bare zip iterator is single-use, so it would be
# empty if the data were iterated a second time (e.g. re-rendering).
cloud.add("文章", list(zip(words, counts)))
cloud.render_notebook()
    <div id="a02363d660c4424f8d9ae310355913cc" style="width:900px; height:500px;"></div>

jieba库

1
import jieba
1
2
3
4
# Tokenize a Chinese news snippet (July 2021 Henan flood report) with jieba,
# then wrap the tokens in a NumPy array for frequency counting below.
text = '7月17日以来,河南省遭遇极端强降雨,中西部、西北部地区出现成片大暴雨,部分地区特大暴雨。根据《国家防汛抗旱应急预案》有关规定,国家防总决定于7月20日20时启动防汛Ⅲ级应急响应。 河南因为暴雨路面出现大规模积水,多名人员被困,全国各地的人民都在关心河南暴雨情况,希望他们平安度过这关。为帮助河南因强降雨受困群众,九派新闻记者陆续搜集河南各地求助信息并进行电话求证核实,并将核实到的信息滚动发布于此条评论区。如您或者家人受困需发布求助信息,可直接在此条置顶内容评论区留言,或直接联系微信号XXXX。请有力量施予援手的你,积极转发扩散'
text_list = jieba.lcut(text) # segment the text into a list of word tokens
arr = np.array(text_list)
arr  # display the token array in the notebook
array(['7', '月', '17', '日', '以来', ',', '河南省', '遭遇', '极端', '强降雨', ',',
       '中西部', '、', '西北部', '地区', '出现', '成片', '大暴雨', ',', '部分', '地区', '特',
       '大暴雨', '。', '根据', '《', '国家', '防汛', '抗旱', '应急', '预案', '》', '有关',
       '规定', ',', '国家', '防总', '决定', '于', '7', '月', '20', '日', '20', '时',
       '启动', '防汛', 'Ⅲ', '级', '应急', '响应', '。', ' ', '河南', '因为', '暴雨', '路面',
       '出现', '大规模', '积水', ',', '多名', '人员', '被困', ',', '全国', '各地', '的',
       '人民', '都', '在', '关心', '河南', '暴雨', '情况', ',', '希望', '他们', '平安',
       '度过', '这关', '。', '为', '帮助', '河南', '因', '强降雨', '受困', '群众', ',',
       '九派', '新闻记者', '陆续', '搜集', '河南', '各地', '求助', '信息', '并', '进行', '电话',
       '求证', '核实', ',', '并', '将', '核实', '到', '的', '信息', '滚动', '发布', '于',
       '此条', '评论', '区', '。', '如', '您', '或者', '家人', '受困', '需', '发布', '求助',
       '信息', ',', '可', '直接', '在', '此条', '置顶', '内容', '评论', '区', '留言', ',',
       '或', '直接', '联系', '微', '信号', 'XXXX', '。', '请', '有', '力量', '施予',
       '援手', '的', '你', ',', '积极', '转发', '扩散'], dtype='<U4')
1
np.unique(arr,return_counts=True)  #统计计算词语
(array([' ', '17', '20', '7', 'XXXX', 'Ⅲ', '、', '。', '《', '》', '中西部', '为',
        '九派', '于', '人员', '人民', '他们', '以来', '你', '信号', '信息', '全国', '关心',
        '内容', '决定', '出现', '到', '力量', '区', '发布', '受困', '可', '各地', '启动',
        '响应', '因', '因为', '国家', '在', '地区', '多名', '大暴雨', '大规模', '如', '家人',
        '将', '希望', '帮助', '平安', '并', '应急', '度过', '强降雨', '微', '您', '情况',
        '成片', '或', '或者', '扩散', '抗旱', '援手', '搜集', '新闻记者', '施予', '日', '时',
        '暴雨', '月', '有', '有关', '极端', '核实', '根据', '此条', '求助', '求证', '河南',
        '河南省', '滚动', '特', '电话', '留言', '的', '直接', '积极', '积水', '级', '置顶',
        '群众', '联系', '被困', '西北部', '规定', '评论', '请', '路面', '转发', '这关', '进行',
        '遭遇', '部分', '都', '防总', '防汛', '陆续', '需', '预案', ','], dtype='<U4'),
 array([ 1,  1,  2,  2,  1,  1,  1,  5,  1,  1,  1,  1,  1,  2,  1,  1,  1,
         1,  1,  1,  3,  1,  1,  1,  1,  2,  1,  1,  2,  2,  2,  1,  2,  1,
         1,  1,  1,  2,  2,  2,  1,  2,  1,  1,  1,  1,  1,  1,  1,  2,  2,
         1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  2,
         2,  1,  1,  1,  2,  1,  2,  2,  1,  4,  1,  1,  1,  1,  1,  3,  2,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  2,  1,  1,  1, 12], dtype=int64))

地图

Map

1
[list(z) for z in zip(Faker.provinces, Faker.values())]
[['广东', 44],
 ['北京', 24],
 ['上海', 135],
 ['江西', 25],
 ['湖南', 109],
 ['浙江', 61],
 ['江苏', 20]]
1
2
3
4
5
6
7
8
9
10
11
12
from pyecharts import options as opts
from pyecharts.charts import Map
from pyecharts.faker import Faker

# Data shape: [['北京', 29], ['上海', 42], ...] — (region name, value) pairs.
province_data = [list(pair) for pair in zip(Faker.provinces, Faker.values())]

c = Map()
# Notes: duplicate region names get their values summed; the world map
# expects English region names while the China map uses Chinese ones.
c.add("商家A", province_data, "china")
c.set_global_opts(title_opts=opts.TitleOpts(title="Map-基本示例"))
c.render_notebook()
    <div id="1e35a634ad934e24be75def22a3ca053" style="width:900px; height:500px;"></div>

Geo

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from pyecharts import options as opts
from pyecharts.charts import Geo
from pyecharts.faker import Faker

# Geo is a geographic coordinate map; point labels are hidden by default.
c = Geo()
c.add_schema(maptype="china")
c.add("geo", [list(pair) for pair in zip(Faker.provinces, Faker.values())])
c.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
c.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(),
    title_opts=opts.TitleOpts(title="Geo-基本示例"),
)
c.render_notebook()
    <div id="8674fac4fa7549b09bc2fb3c52b13a24" style="width:900px; height:500px;"></div>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import math

from pyecharts.faker import Faker
from pyecharts import options as opts
from pyecharts.charts import Line3D

# Sample a parametric 3D curve: a rising helix whose radius is modulated
# by a high-frequency cosine (25 000 points, t in [0, 25)).
data = []
for step in range(0, 25000):
    t = step / 1000
    radius = 1 + 0.25 * math.cos(75 * t)
    x = radius * math.cos(t)
    y = radius * math.sin(t)
    z = t + 2.0 * math.sin(75 * t)
    data.append([x, y, z])

c = Line3D()
c.add(
    "",
    data,
    xaxis3d_opts=opts.Axis3DOpts(Faker.clock, type_="value"),
    yaxis3d_opts=opts.Axis3DOpts(Faker.week_en, type_="value"),
    grid3d_opts=opts.Grid3DOpts(width=100, height=100, depth=100),
)
c.set_global_opts(
    # Color the line by z value over the [0, 30] range.
    visualmap_opts=opts.VisualMapOpts(
        max_=30, min_=0, range_color=Faker.visual_color
    ),
    title_opts=opts.TitleOpts(title="Line3D-基本示例"),
)
c.render_notebook()
    <div id="7d91a1ebb5be4a56960ecf5e78dfa703" style="width:900px; height:500px;"></div>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import pyecharts.options as opts
from pyecharts.charts import Bar3D

"""
Gallery 使用 pyecharts 1.1.0
参考地址: https://echarts.baidu.com/examples/editor.html?c=bar3d-punch-card&gl=1

目前无法实现的功能:

1、光照和阴影暂时无法设置
"""

# Hour-of-day labels in 12-hour notation: 12a, 1a, …, 11a, 12p, 1p, …, 11p.
hours = (
    ["12a"]
    + ["{}a".format(h) for h in range(1, 12)]
    + ["12p"]
    + ["{}p".format(h) for h in range(1, 12)]
)
# Day labels, ordered for the chart's y axis (Saturday first).
days = ["Saturday", "Friday", "Thursday", "Wednesday", "Tuesday", "Monday", "Sunday"]

data = [
[0, 0, 5],
[0, 1, 1],
[0, 2, 0],
[0, 3, 0],
[0, 4, 0],
[0, 5, 0],
[0, 6, 0],
[0, 7, 0],
[0, 8, 0],
[0, 9, 0],
[0, 10, 0],
[0, 11, 2],
[0, 12, 4],
[0, 13, 1],
[0, 14, 1],
[0, 15, 3],
[0, 16, 4],
[0, 17, 6],
[0, 18, 4],
[0, 19, 4],
[0, 20, 3],
[0, 21, 3],
[0, 22, 2],
[0, 23, 5],
[1, 0, 7],
[1, 1, 0],
[1, 2, 0],
[1, 3, 0],
[1, 4, 0],
[1, 5, 0],
[1, 6, 0],
[1, 7, 0],
[1, 8, 0],
[1, 9, 0],
[1, 10, 5],
[1, 11, 2],
[1, 12, 2],
[1, 13, 6],
[1, 14, 9],
[1, 15, 11],
[1, 16, 6],
[1, 17, 7],
[1, 18, 8],
[1, 19, 12],
[1, 20, 5],
[1, 21, 5],
[1, 22, 7],
[1, 23, 2],
[2, 0, 1],
[2, 1, 1],
[2, 2, 0],
[2, 3, 0],
[2, 4, 0],
[2, 5, 0],
[2, 6, 0],
[2, 7, 0],
[2, 8, 0],
[2, 9, 0],
[2, 10, 3],
[2, 11, 2],
[2, 12, 1],
[2, 13, 9],
[2, 14, 8],
[2, 15, 10],
[2, 16, 6],
[2, 17, 5],
[2, 18, 5],
[2, 19, 5],
[2, 20, 7],
[2, 21, 4],
[2, 22, 2],
[2, 23, 4],
[3, 0, 7],
[3, 1, 3],
[3, 2, 0],
[3, 3, 0],
[3, 4, 0],
[3, 5, 0],
[3, 6, 0],
[3, 7, 0],
[3, 8, 1],
[3, 9, 0],
[3, 10, 5],
[3, 11, 4],
[3, 12, 7],
[3, 13, 14],
[3, 14, 13],
[3, 15, 12],
[3, 16, 9],
[3, 17, 5],
[3, 18, 5],
[3, 19, 10],
[3, 20, 6],
[3, 21, 4],
[3, 22, 4],
[3, 23, 1],
[4, 0, 1],
[4, 1, 3],
[4, 2, 0],
[4, 3, 0],
[4, 4, 0],
[4, 5, 1],
[4, 6, 0],
[4, 7, 0],
[4, 8, 0],
[4, 9, 2],
[4, 10, 4],
[4, 11, 4],
[4, 12, 2],
[4, 13, 4],
[4, 14, 4],
[4, 15, 14],
[4, 16, 12],
[4, 17, 1],
[4, 18, 8],
[4, 19, 5],
[4, 20, 3],
[4, 21, 7],
[4, 22, 3],
[4, 23, 0],
[5, 0, 2],
[5, 1, 1],
[5, 2, 0],
[5, 3, 3],
[5, 4, 0],
[5, 5, 0],
[5, 6, 0],
[5, 7, 0],
[5, 8, 2],
[5, 9, 0],
[5, 10, 4],
[5, 11, 1],
[5, 12, 5],
[5, 13, 10],
[5, 14, 5],
[5, 15, 7],
[5, 16, 11],
[5, 17, 6],
[5, 18, 0],
[5, 19, 5],
[5, 20, 3],
[5, 21, 4],
[5, 22, 2],
[5, 23, 0],
[6, 0, 1],
[6, 1, 0],
[6, 2, 0],
[6, 3, 0],
[6, 4, 0],
[6, 5, 0],
[6, 6, 0],
[6, 7, 0],
[6, 8, 0],
[6, 9, 0],
[6, 10, 1],
[6, 11, 0],
[6, 12, 2],
[6, 13, 1],
[6, 14, 3],
[6, 15, 4],
[6, 16, 0],
[6, 17, 0],
[6, 18, 0],
[6, 19, 0],
[6, 20, 1],
[6, 21, 2],
[6, 22, 2],
[6, 23, 6],
]
data = [[d[1], d[0], d[2]] for d in data]


# Punch-card style 3D bar chart: hours on x, weekdays on y, counts on z.
chart = Bar3D(init_opts=opts.InitOpts(width="1600px", height="800px"))
chart.add(
    series_name="",
    data=data,
    xaxis3d_opts=opts.Axis3DOpts(type_="category", data=hours),
    yaxis3d_opts=opts.Axis3DOpts(type_="category", data=days),
    zaxis3d_opts=opts.Axis3DOpts(type_="value"),
)
chart.set_global_opts(
    # Blue-to-red gradient over the value range [0, 20].
    visualmap_opts=opts.VisualMapOpts(
        max_=20,
        range_color=[
            "#313695",
            "#4575b4",
            "#74add1",
            "#abd9e9",
            "#e0f3f8",
            "#ffffbf",
            "#fee090",
            "#fdae61",
            "#f46d43",
            "#d73027",
            "#a50026",
        ],
    )
)
chart.render_notebook()

    <div id="286cf24d686747b0be126ef08292eaef" style="width:1600px; height:800px;"></div>

布局

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from pyecharts import options as opts
from pyecharts.charts import Bar, Line
from pyecharts.faker import Faker

# Monthly evaporation (ml), precipitation (ml) and mean temperature (°C).
evaporation = [2.0, 4.9, 7.0, 23.2, 25.6, 76.7, 135.6, 162.2, 32.6, 20.0, 6.4, 3.3]
precipitation = [2.6, 5.9, 9.0, 26.4, 28.7, 70.7, 175.6, 182.2, 48.7, 18.8, 6.0, 2.3]
temperature = [2.0, 2.2, 3.3, 4.5, 6.3, 10.2, 20.3, 23.4, 23.0, 16.5, 12.0, 6.2]

bar = Bar()
bar.add_xaxis(Faker.months)
bar.add_yaxis("蒸发量", evaporation)
bar.add_yaxis("降水量", precipitation)
# Secondary y axis (index 1) for the temperature line, labelled in °C.
bar.extend_axis(
    yaxis=opts.AxisOpts(
        axislabel_opts=opts.LabelOpts(formatter="{value} °C"), interval=5
    )
)
bar.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="Overlap-bar+line"),
    yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(formatter="{value} ml")),
)

# Draw the temperature line on the secondary axis and overlay it on the bars.
line = Line()
line.add_xaxis(Faker.months)
line.add_yaxis("平均温度", temperature, yaxis_index=1)

bar.overlap(line)
bar.render_notebook()
    <div id="b1272803e07c4ca380d531d2f6309fcf" style="width:900px; height:500px;"></div>

并行多图Grid

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from pyecharts import options as opts
from pyecharts.charts import Bar, Geo, Grid
from pyecharts.faker import Faker

bar = Bar()
bar.add_xaxis(Faker.choose())
bar.add_yaxis("商家A", Faker.values())
bar.add_yaxis("商家B", Faker.values())
bar.set_global_opts(legend_opts=opts.LegendOpts(pos_left="20%"))

geo = Geo()
geo.add_schema(maptype="china")
geo.add("geo", [list(pair) for pair in zip(Faker.provinces, Faker.values())])
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
geo.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(),
    title_opts=opts.TitleOpts(title="Grid-Geo-Bar"),
)

# A larger canvas plus explicit GridOpts offsets keep the two charts
# from overlapping each other.
grid = Grid(init_opts=opts.InitOpts(width="900px",height="900px"))
grid.add(bar, grid_opts=opts.GridOpts(pos_left="10%",pos_top="80%",is_show=True))
grid.add(geo, grid_opts=opts.GridOpts(pos_left="90%",pos_top="10%",is_show=True))
1
grid.render_notebook()
    <div id="2b97e72a738d452eb0c8940456b653cb" style="width:900px; height:900px;"></div>

顺序多图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from pyecharts import options as opts
from pyecharts.charts import Bar, Geo, Page
from pyecharts.faker import Faker

bar = Bar()
bar.add_xaxis(Faker.choose())
bar.add_yaxis("商家A", Faker.values())
bar.add_yaxis("商家B", Faker.values())
bar.set_global_opts(legend_opts=opts.LegendOpts(pos_left="20%"))

geo = Geo()
geo.add_schema(maptype="china")
geo.add("geo", [list(pair) for pair in zip(Faker.provinces, Faker.values())])
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
geo.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(),
    title_opts=opts.TitleOpts(title="Grid-Geo-Bar"),
)

# Page stacks charts vertically, one after another.
# Page.add takes no layout parameters — per-chart positioning is not configurable.
page = Page()
page.add(bar)
page.add(geo)
1
page.render_notebook()
    <div id="199ff42a508d44f6ad14a1d74408c399" style="width:900px; height:500px;"></div>
    <div id="f5cb6a87a462408c9727b8f9c49ce892" style="width:900px; height:500px;"></div>

选项卡

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from pyecharts import options as opts
from pyecharts.charts import Bar, Geo, Tab
from pyecharts.faker import Faker

bar = Bar()
bar.add_xaxis(Faker.choose())
bar.add_yaxis("商家A", Faker.values())
bar.add_yaxis("商家B", Faker.values())
bar.set_global_opts(legend_opts=opts.LegendOpts(pos_left="20%"))

geo = Geo()
geo.add_schema(maptype="china")
geo.add("geo", [list(pair) for pair in zip(Faker.provinces, Faker.values())])
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
geo.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(),
    title_opts=opts.TitleOpts(title="Grid-Geo-Bar"),
)

# Tab shows each chart on its own labelled tab.
tab = Tab()
tab.add(geo, "地图")
tab.add(bar, "柱形图")
1
tab.render_notebook() #组件选项卡
    <div id="bacaaeb1f73d409e9d08ae0162754f93" class="chart-container" style="width:900px; height:500px;"></div>
    <div id="9950ad587efa42a69f96398587262ac4" class="chart-container" style="width:900px; height:500px;"></div>

时间线轮播

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from pyecharts import options as opts
from pyecharts.charts import Pie, Timeline
from pyecharts.faker import Faker

# One pie per year (2015-2020); Timeline adds a play/scrub bar underneath
# that switches between them.
attr = Faker.choose()
t1 = Timeline()
for year in range(2015, 2021):
    yearly_pie = Pie()
    yearly_pie.add(
        "商家A",
        [list(pair) for pair in zip(attr, Faker.values())],
    )
    yearly_pie.set_global_opts(title_opts=opts.TitleOpts('某商店{}年营业额'.format(year)))
    t1.add(yearly_pie, "{}年".format(year))
t1.render_notebook()
    <div id="b75cabdfb8c74bb090e8526962ae8347" style="width:900px; height:500px;"></div>

在ppt中插入Pyecharts图表
https://www.cnblogs.com/mark-wq/p/14168535.html

html要改ie内核:
加上:<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE10"/>

顺便评个分吧!👇
<i class="fa fa-angle-left"></i>123…6<i class="fa fa-angle-right"></i>

51 posts
12 categories
30 tags
© 2021 Alogomachine
Powered by Hexo
粤ICP备 - 2021089270 | 粤公网安备 2021089270